In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy.sparse.linalg import svds
import math
import ast

from recsys.preprocess import *

import functools

#from recsys.utility import *

#RANDOM_STATE = 666

#np.random.seed(RANDOM_STATE)

%matplotlib inline

In [None]:
def train_test_split(train, test_size=0.3, min_playlist_tracks=7, n_test_tracks=5):
    """
    Hold out a fixed number of tracks from a random subset of playlists.

    A playlist is eligible when it appears in the notebook-level
    ``target_playlists_original`` frame and has at least
    ``min_playlist_tracks`` rows in ``train``. A fraction ``test_size`` of the
    eligible playlists is drawn without replacement, and ``n_test_tracks``
    random rows of each drawn playlist are moved from ``train`` to ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Interactions with at least ``playlist_id`` and ``track_id`` columns.
    test_size : float
        Fraction of the eligible playlists to draw.
    min_playlist_tracks : int
        Minimum number of rows a playlist needs to be eligible.
    n_test_tracks : int
        Rows held out per drawn playlist (5 in the original notebook).

    Returns
    -------
    tuple
        ``(train, test, target_playlists, target_tracks)``: the reduced
        training frame, the held-out rows, a one-column ``playlist_id`` frame
        and a one-column ``track_id`` frame (sorted, unique).

    Raises
    ------
    ValueError
        If ``min_playlist_tracks < n_test_tracks``: an eligible playlist could
        then be too short to sample from.
    """
    if min_playlist_tracks < n_test_tracks:
        raise ValueError(
            "min_playlist_tracks ({0}) must be at least n_test_tracks ({1})".format(
                min_playlist_tracks, n_test_tracks))

    # NOTE: relies on the global `target_playlists_original` loaded by the notebook.
    eligible = train[train.playlist_id.isin(target_playlists_original.playlist_id)]
    track_counts = eligible.groupby('playlist_id').count()
    to_choose_playlists = track_counts[track_counts['track_id'] >= min_playlist_tracks].index.values

    # Numpy array of playlist ids drawn for testing.
    target_playlists = np.random.choice(to_choose_playlists, replace=False,
                                        size=int(test_size * len(to_choose_playlists)))

    # Group once instead of re-scanning the whole frame for every playlist.
    rows_by_playlist = train[train.playlist_id.isin(target_playlists)].groupby('playlist_id')

    # Collect the sampled index labels and combine them once at the end; this keeps
    # their original (integer) dtype and avoids a quadratic chain of unions.
    sampled_labels = [
        rows_by_playlist.get_group(p).sample(n_test_tracks).index.values
        for p in target_playlists
    ]
    if sampled_labels:
        indexes = np.unique(np.concatenate(sampled_labels))
    else:
        indexes = train.index.values[:0]

    test = train.loc[indexes].copy()
    train = train.drop(indexes)
    target_tracks = np.unique(test['track_id'].values)

    return (train, test,
            pd.DataFrame(target_playlists, columns=['playlist_id']),
            pd.DataFrame(target_tracks, columns=['track_id']))


In [None]:
import numpy as np
import scipy
from scipy.sparse import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def _top_entries_per_row(res_rows, top):
    """Yield one 1-row CSR matrix per row of `res_rows`, keeping its `top` largest stored values."""
    n_cols = res_rows.shape[1]
    indptr, indices, data = res_rows.indptr, res_rows.indices, res_rows.data
    for i in range(res_rows.shape[0]):
        start, end = indptr[i], indptr[i + 1]
        row_data = data[start:end]
        keep = np.argsort(row_data)[-top:]
        row_index = np.zeros(len(keep), dtype=int)
        yield csr_matrix((row_data[keep], (row_index, indices[start:end][keep])), shape=(1, n_cols))


def dot_with_top(m1, m2, def_rows_g, top=-1, row_group=1, similarity="dot", shrinkage=0.000001, alpha=1):
    """
    Multiply sparse matrices m1 and m2, keeping only the `top` largest values per result row.

    The product is computed `row_group` rows of m1 at a time. When `top <= 0`
    the plain product `m1.dot(m2)` is returned.

    Parameters
    ----------
    m1, m2 : scipy sparse matrices with compatible shapes.
    def_rows_g : sparse matrix
        Fallback rows, one per row of a chunk, with as many columns as the
        result. They are used when a chunk of m1, or its product, is all zero.
    top : int
        Number of values kept per row.
    row_group : int
        Chunk size in rows.
    similarity : str
        "cosine-old" (sklearn cosine_similarity), "cosine", "cosine-asym",
        "dot-old"; any other value, including the default "dot", is the plain
        dot product.
    shrinkage : float
        Added to the denominator of "cosine" and "cosine-asym" only.
    alpha : float
        Exponent used by "cosine-asym".

    Returns
    -------
    scipy.sparse.csr_matrix with as many rows as m1.

    Row selection follows
    https://stackoverflow.com/questions/29647326/sparse-matrix-dot-product-keeping-only-n-max-values-per-result-row
    """
    if top <= 0:
        return m1.dot(m2)

    # Pre-compute only what the chosen similarity needs.
    m2_transposed = m2.transpose() if similarity == "cosine-old" else None
    l2 = m2.sum(axis=0) if similarity in ("cosine", "cosine-asym") else None  # column sums

    final_rows = []
    for row_id in range(0, m1.shape[0], row_group):
        rows = m1[row_id:row_id + row_group]
        n_chunk = rows.shape[0]

        if rows.count_nonzero() == 0:
            print("Add empty 3")
            # The last chunk may be shorter than row_group: take only as many
            # fallback rows as the chunk has, so the output keeps m1's row count.
            final_rows.append(def_rows_g[:n_chunk])
        else:
            if similarity == "cosine-old":
                res_rows = cosine_similarity(rows, m2_transposed, dense_output=False)
            elif similarity == "cosine":
                res_rows = rows.dot(m2) / (np.sqrt(rows.sum(axis=1)) * np.sqrt(l2) + shrinkage)
            elif similarity == "cosine-asym":
                res_rows = rows.dot(m2) / (np.power(rows.sum(axis=1), alpha) * np.power(l2, (1 - alpha)) + shrinkage)
            else:
                # "dot-old" and the default "dot": plain product. Shrinkage is not
                # applied here; adding a non-zero scalar to a sparse matrix is
                # unsupported and made the old default branch raise.
                res_rows = rows.dot(m2)
            res_rows = csr_matrix(res_rows)

            if res_rows.count_nonzero() > 0:
                final_rows.extend(_top_entries_per_row(res_rows, top))
            else:
                print("Add empty 2")
                final_rows.extend(def_rows_g[0] for _ in range(n_chunk))

        # Progress: index of the first row of the next chunk.
        print(row_id + row_group)

    return scipy.sparse.vstack(final_rows, 'csr')

In [None]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """Return the value stored in `column` (default 'track_id') at positional row `row_num` of `df`."""
    selected_row = df.iloc[row_num]
    return selected_row[column]

def from_id_to_num(df, tr_id, column='track_id'):
    """Return the position of the first row of `df` whose `column` (default 'track_id') equals `tr_id`."""
    is_match = df[column].values == tr_id
    positions = np.where(is_match)[0]
    return positions[0]

# Read data

In [None]:
# Load the tab-separated input files from the local data/ directory.
train = pd.read_csv('data/train_final.csv', delimiter='\t')
playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
target_tracks = pd.read_csv('data/target_tracks.csv', delimiter = '\t')
tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

In [None]:
# Untouched copies of the training interactions and of the target playlists.
# `target_playlists_original` is read by train_test_split() to pick eligible playlists.
# NB: these must not be used for training; they are kept only to compare the
# locally generated test split with the original data.
train_original = pd.read_csv('data/train_final.csv', delimiter='\t')
target_playlists_original = pd.read_csv('data/target_playlists.csv', delimiter='\t')

In [None]:
# Row counts before the local train/test split.
len(train), len(target_playlists), len(target_tracks)

In [None]:
# Replace the loaded frames with a local split. test_size=1 selects every eligible
# playlist (those with at least 13 tracks).
# NOTE: this rebinds `train`, so re-running the cell splits the already reduced frame again.
train, test, target_playlists, target_tracks = train_test_split(train, test_size=1, min_playlist_tracks=13)

In [None]:
# Row counts after the local train/test split.
len(train), len(test), len(target_playlists), len(target_tracks)

# Process data

In [None]:
# Number of tracks whose raw album string is exactly "[None]".
len(tracks[tracks["album"] == "[None]"])

In [None]:
# Number of tracks whose raw album string is exactly "[]".
len(tracks[tracks["album"] == "[]"])

In [None]:
# Number of tracks whose raw album string is exactly "-1".
len(tracks[tracks["album"] == "-1"])

In [None]:
# Re-index tracks and playlists with their positional row number, so ids can be used
# directly as sparse-matrix row/column numbers. Original ids are kept in *_tmp columns.
# NOTE: this cell rebinds columns in place and is not safe to run twice on the same frames.
tracks['track_id_tmp'] = tracks['track_id']

tracks['track_id'] = tracks.index

playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index

train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']

# Lookup: original track id -> row number in `tracks`.
track_to_num = pd.Series(tracks.index)
track_to_num.index = tracks['track_id_tmp']

# Lookup: original playlist id -> row number in `playlists`.
playlist_to_num = pd.Series(playlists.index)
playlist_to_num.index = playlists['playlist_id_tmp']

# Lookup: row number in `tracks` -> original track id.
num_to_tracks = pd.Series(tracks['track_id_tmp'])

train['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

# `tags` and `title` hold textual list literals. ast.literal_eval parses them without
# executing arbitrary code from the data files (the previous eval() would).
tracks.tags = tracks.tags.apply(lambda s: np.array(ast.literal_eval(s), dtype=int))

playlists.title = playlists.title.apply(lambda s: np.array(ast.literal_eval(s), dtype=int))

target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

# Create a dataframe that maps a playlist to the array of its tracks
playlist_tracks = pd.DataFrame(train['playlist_id'].drop_duplicates())
playlist_tracks.index = train['playlist_id'].unique()
playlist_tracks['track_ids'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# Create a dataframe that maps a track to the array of the playlists it appears in
track_playlists = pd.DataFrame(train['track_id'].drop_duplicates())
track_playlists.index = train['track_id'].unique()
track_playlists['playlist_ids'] = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

def transform_album_pr(alb):
    """
    Classify the raw album field of a track.

    Parameters
    ----------
    alb : str
        Textual list literal, e.g. "[]", "[None]" or "[123]".

    Returns
    -------
    int
        2 if the list is empty, 3 if its first element is None, 1 otherwise.
    """
    # literal_eval only parses Python literals; unlike eval() it cannot run code
    # embedded in the data file.
    ar = ast.literal_eval(alb)
    if len(ar) == 0:
        return 2
    if ar[0] is None:
        return 3
    return 1

# Album presence code per track: 1 = first element present, 2 = empty list, 3 = first element is None.
tracks["album_presence"] = tracks.album.apply(lambda alb: transform_album_pr(alb))

# Substitute each bad album (i.e. an ill-formed album such as -1, None, or an empty list) with the 0 album
bad_albums = 0  # number of bad albums replaced so far
def transform_album_1(alb):
    """
    Parse the raw album field and return its first element, or 0 if it is unusable.

    Parameters
    ----------
    alb : str
        Textual list literal, e.g. "[]", "[None]", "[-1]" or "[123]".

    Returns
    -------
    The first list element, or 0 when the list is empty or its first element
    is None or -1. Each such replacement increments the global `bad_albums`.
    """
    global bad_albums
    # literal_eval only parses Python literals; unlike eval() it cannot run code
    # embedded in the data file.
    ar = ast.literal_eval(alb)
    if len(ar) == 0 or ar[0] is None or ar[0] == -1:
        bad_albums += 1
        return 0
    return ar[0]

# Replace the raw album strings with a single album id per track (0 = unknown album).
# NOTE: this overwrites the raw strings, so the cell cannot be run twice.
tracks.album = tracks.album.apply(lambda alb: transform_album_1(alb))

In [None]:
# Preview the processed tracks frame.
tracks.head()

## Recover albums
Choose one of the following:<br>
1 - fill with most similar albums according to the URM<br>
2 - fill with brand new albums 

#### Fill with most similar albums according to the URM

In [None]:
def get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
    Build the playlist x album matrix (User Album Matrix).

    Entry (p, a) counts how many tracks of playlist p belong to album a; the
    counts can optionally be re-weighted.

    Parameters
    ----------
    tracks : DataFrame indexed by track number, with an `album` column.
    playlist_tracks : DataFrame with `playlist_id` and `track_ids` columns.
    target_playlists : unused, kept for interface compatibility.
    norm : str
        "no" (raw counts, default), "idf", "okapi" or "tf".
    OKAPI_K, OKAPI_B : float
        Parameters of the okapi / tf weighting.

    Returns
    -------
    tuple
        `(UAM_album, UAM_album_no_norm, album_to_val)`: the (possibly
        weighted) lil_matrix, the raw-count lil_matrix, and a dict mapping
        album -> idf (empty when norm is "no").

    Notes
    -----
    The number of rows comes from the global `playlists` frame, not from an
    argument. The idf uses the constant 500 as numerator, as in the original
    notebook.
    """
    n_playlists = max(playlists.playlist_id) + 1
    n_albums = max(tracks.album.unique()) + 1

    UAM_album = lil_matrix((n_playlists, n_albums))
    UAM_album_no_norm = lil_matrix((n_playlists, n_albums))
    album_to_playlists = {}
    album_of_track = tracks.album

    for i, row in enumerate(playlist_tracks.itertuples(), start=1):
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            alb = album_of_track.loc[tr_id]
            UAM_album[pl_id, alb] += 1
            UAM_album_no_norm[pl_id, alb] += 1
            # One entry per (playlist, track) occurrence, so a playlist may repeat.
            album_to_playlists.setdefault(alb, []).append(pl_id)

        if i % 1000 == 0:
            print(i)

    album_to_val = {}
    if norm in ("okapi", "idf", "tf"):
        avg_document_length = sum(len(tr_ids) for tr_ids in playlist_tracks.track_ids) / len(playlist_tracks)

        for i, row in enumerate(playlist_tracks.itertuples(), start=1):
            pl_id = row.playlist_id
            # Snapshot the row before rewriting it. The lil row lists are live, so
            # reading the document length inside the loop would mix raw counts with
            # already re-weighted values.
            albums = list(UAM_album.rows[pl_id])
            counts = list(UAM_album.data[pl_id])
            document_length = sum(counts)

            for album, fq in zip(albums, counts):
                nq = len(album_to_playlists[album])
                idf = math.log(500/(nq + 0.5))
                album_to_val.setdefault(album, idf)

                if norm == "idf":
                    weight = idf
                else:
                    tf = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
                    weight = idf*tf if norm == "okapi" else tf
                UAM_album[pl_id, album] = weight

            if i % 1000 == 0:
                print(i)

    return UAM_album, UAM_album_no_norm, album_to_val

In [None]:
# Build the playlist x album matrices used below to guess the album of tracks whose album is unknown (0).
# UAM_album holds idf weights; UAM_album_no_norm holds raw counts.
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="idf")

In [None]:
# Inspect the last rows of tracks.
tracks.tail()

In [None]:
# Start from the parsed album ids; unknown albums (0) are filled in by the correction loop below.
tracks["album_corrected"] = tracks["album"]

In [None]:
# Check that the album_corrected column was added.
tracks.tail()

In [None]:
def transform_album_sim(tr_id):
    """
    Guess the album of track `tr_id` from the playlists that contain it.

    For every playlist containing the track (per the global `track_playlists`),
    the log-damped album counts of that playlist (global `UAM_album_no_norm`)
    are summed. The album with the highest total wins; album 0 (unknown) is
    never proposed, so the runner-up is used when 0 ranks first.

    Returns
    -------
    int
        The proposed album id, or 0 when no known album has a positive score.
    """
    tot = np.zeros(max(tracks.album) + 1)
    for pl_id in track_playlists.loc[tr_id].playlist_ids:
        tot += np.log(UAM_album_no_norm[pl_id].toarray()[0] + 1)

    if tot.max() == 0:
        return 0

    best_1 = tot.argmax()
    if best_1 != 0:
        # A real album ranks first. The previous version fell through to
        # `return 0` here and never proposed it.
        return best_1

    if len(tot) < 2:
        return 0
    # The unknown album ranks first: fall back to the second highest score,
    # but only if that album actually co-occurs (score > 0).
    best_2 = tot.argpartition(len(tot)-2)[-2]
    return best_2 if tot[best_2] > 0 else 0

# Fill `album_corrected` for tracks that have an unknown album (0) and occur in at
# least one training playlist.
corrected_albums = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if row.album_corrected == 0:
        new_album = transform_album_sim(row.track_id)
        if new_album != 0:
            # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
            tracks.at[row.track_id, "album_corrected"] = new_album
            corrected_albums += 1
            if corrected_albums % 100 == 0:
                print(corrected_albums)

In [None]:
# Inspect the result of the album correction.
tracks.tail()

#### Fill with brand new albums

In [None]:
# Substitute each 0 album with a brand new album
def transform_album_2(alb):
    """Return `alb` unchanged unless it is 0; a 0 gets the next unused id from the global `next_album_id`, which is then advanced."""
    global next_album_id
    if alb != 0:
        return alb
    fresh_id = next_album_id
    next_album_id += 1
    return fresh_id
# New album ids start right after the largest existing one.
last_album = tracks.album.max()
next_album_id = last_album + 1
# Give every track with the unknown album (0) its own fresh album id.
# NOTE: this rewrites tracks.album in place; album_corrected (computed above) is not touched.
tracks.album = tracks.album.apply(lambda alb: transform_album_2(alb))

In [None]:
# Sanity check: number of tracks still assigned to the unknown album (0).
len(tracks[tracks.album == 0])

## Recover tags according to URM

In [None]:
# Start from the parsed tags; tracks without tags are filled in by the correction loop below.
tracks["tags_corrected"] = tracks["tags"]

In [None]:
# Inverted index: tag -> list of the track ids carrying that tag
tag_tracks = {}
for row in tracks.itertuples():
    for tag in row.tags:
        tag_tracks.setdefault(tag, []).append(row.track_id)

In [None]:
# User Tag Matrix UTM
def get_UTM(tracks, playlist_tracks, tag_tracks, norm="no", OKAPI_K=1.7, OKAPI_B=0.75, best_tag=False):
    """
    Build the playlist x tag matrix (User Tag Matrix).

    Entry (p, t) counts how many tracks of playlist p carry tag t; the counts
    can optionally be re-weighted.

    Parameters
    ----------
    tracks : DataFrame indexed by track number, with a `tags` column (and a
        `best_tag` column when `best_tag` is True).
    playlist_tracks : DataFrame with `playlist_id` and `track_ids` columns.
    tag_tracks : dict mapping tag -> list of track ids.
    norm : str
        "no" (raw counts, default), "okapi", "idf" or "tf".
    OKAPI_K, OKAPI_B : float
        Parameters of the okapi / tf weighting.
    best_tag : bool
        If True, use one tag per track (`tracks.best_tag`) and the global
        `best_tag_tracks` dict, which must be defined by the caller's
        notebook, instead of `tag_tracks`.

    Returns
    -------
    tuple
        `(UTM, UTM_no_norm)`: the (possibly weighted) and the raw-count
        lil_matrix.

    Notes
    -----
    The number of rows comes from the global `playlists` frame. The idf uses
    the constant 28000 as numerator, as in the original notebook.
    """
    tag_index = best_tag_tracks if best_tag else tag_tracks
    unique_tags = list(tag_index.keys())

    n_playlists = max(playlists.playlist_id) + 1
    n_tags = max(unique_tags) + 1
    UTM = lil_matrix((n_playlists, n_tags))
    UTM_no_norm = lil_matrix((n_playlists, n_tags))

    for i, row in enumerate(playlist_tracks.itertuples(), start=1):
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            tr_row = tracks.loc[tr_id]
            if best_tag:
                UTM[pl_id, tr_row.best_tag] += 1
                UTM_no_norm[pl_id, tr_row.best_tag] += 1
            else:
                for tag in tr_row.tags:
                    UTM[pl_id, tag] += 1
                    UTM_no_norm[pl_id, tag] += 1

        if i % 1000 == 0:
            print(i)

    if norm in ("okapi", "idf", "tf"):
        # Average number of tag occurrences per matrix row.
        avg_document_length = sum(sum(row_data) for row_data in UTM.data) / len(UTM.data)

        for i, row in enumerate(playlist_tracks.itertuples(), start=1):
            pl_id = row.playlist_id
            # Snapshot the row before rewriting it. The lil row lists are live, so
            # reading the document length inside the loop would mix raw counts with
            # already re-weighted values.
            tags = list(UTM.rows[pl_id])
            counts = list(UTM.data[pl_id])
            document_length = sum(counts)

            for tag, fq in zip(tags, counts):
                nq = len(tag_index[tag])
                idf = math.log(28000/(nq + 0.5))

                if norm == "idf":
                    weight = idf
                else:
                    tf = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
                    weight = idf*tf if norm == "okapi" else tf
                UTM[pl_id, tag] = weight

            if i % 1000 == 0:
                print(i)

    return UTM, UTM_no_norm

In [None]:
# Okapi-weighted playlist x tag matrix, used below to guess tags for tracks that have none.
UTM, UTM_no_norm = get_UTM(tracks, playlist_tracks, tag_tracks, norm="okapi", best_tag=False)

In [None]:
def get_tags_sim(tr_id):
    """
    Propose five tags for track `tr_id`.

    Sums the rows of the global `UTM` for every playlist containing the track
    (per the global `track_playlists`) and returns the five column indices
    with the highest totals, best first.
    """
    accumulated = csr_matrix((1, max(tag_tracks)+1))
    for pl_id in track_playlists.loc[tr_id].playlist_ids:
        accumulated += UTM[pl_id]
    scores = accumulated.toarray()[0]
    ranking = scores.argsort()[::-1]
    return ranking[0:5]
    

# Fill `tags_corrected` for tracks that have no tags and occur in at least one
# training playlist.
corrected_tags = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if len(row.tags) == 0:
        new_tags = get_tags_sim(row.track_id)
        # DataFrame.set_value was removed in pandas 1.0; .at stores the array in the
        # (object-dtype) cell.
        tracks.at[row.track_id, "tags_corrected"] = new_tags

        corrected_tags += 1
        if corrected_tags % 100 == 0:
            print(corrected_tags)

In [None]:
# Inspect the result of the tag correction.
tracks.tail()

# "Training"

## Item-item similarity using only URM

In [None]:
def sigmoid(gamma):
    """Logistic function 1 / (1 + e^-gamma), computed with a separate formula for negative and non-negative `gamma`."""
    return (1 - 1/(1 + math.exp(gamma))) if gamma < 0 else 1/(1 + math.exp(-gamma))

# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no", pow_base=500, pow_exp=0.15):
    """
    Build the playlists x tracks interaction matrix as a lil_matrix.

    Every (playlist, track) pair found in `track_playlists` gets a weight that
    depends only on nq, the number of playlists containing the track:
      "idf"  -> log(500 / nq)
      "sqrt" -> sqrt(500 / nq)
      "pow"  -> (pow_base / nq) ** pow_exp
      "atan" -> 3 + atan(-0.1 * nq + 1)
      anything else (default "no") -> 1
    The shape is (len(playlists), len(tracks)).
    """
    weight_by_norm = {
        "idf": lambda nq: math.log((500)/nq),
        "sqrt": lambda nq: math.sqrt((500)/nq),
        "pow": lambda nq: math.pow((pow_base)/nq, pow_exp),
        "atan": lambda nq: 3 + 1*math.atan(-0.1*nq + 1),
    }
    weight_of = weight_by_norm.get(norm, lambda nq: 1)

    URM = lil_matrix((len(playlists), len(tracks)))
    num_playlists = len(playlist_tracks)  # computed as in the original; not used below

    for counter, row in enumerate(track_playlists.itertuples()):
        containing = row.playlist_ids
        popularity = len(containing)
        for pl_id in containing:
            URM[pl_id, row.track_id] = weight_of(popularity)
        # Progress output every 1000 tracks.
        if counter % 1000 == 0:
            print(counter)

    return URM

#
# URM:
# 
#              tracks
#            _________
#           \         \
# playlists \         \
#           \_________\
#

In [None]:
# Binary interaction matrix: despite the name, norm="no" stores 1 for every (playlist, track) pair.
URM_normalize = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no")

In [None]:
# Popularity-damped interaction matrix: weight = (500 / nq) ** 0.15, nq = number of playlists containing the track.
URM_pow = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="pow", pow_base=500, pow_exp=0.15)

In [None]:
# Step 2: produce item-item matrix with cosine similarity (top 50 neighbours per track)
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track); dot_with_top
# uses them for chunks that would otherwise be all zeros.
def_rows_i = csr_matrix((row_group, URM_normalize.shape[1]))
TTM_cosine = dot_with_top(URM_normalize.transpose(), URM_normalize, def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

In [None]:
# Item-item matrix from the plain dot product of the popularity-damped URM (top 50 per track).
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track).
def_rows_i = csr_matrix((row_group, URM_pow.shape[1]))
TTM_dot = dot_with_top(URM_pow.transpose(), URM_pow, def_rows_i, top=50, row_group=row_group, similarity="dot-old")

## Item-item similarity starting from a user-user similarity using only the URM

In [None]:
# User-user (playlist-playlist) cosine similarity, top 500 neighbours per playlist.
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per playlist).
def_rows_i = csr_matrix((row_group, URM_normalize.transpose().shape[1]))
UUM_cosine = dot_with_top(URM_normalize, URM_normalize.transpose(), def_rows_i, top=500, row_group=row_group, similarity="cosine-old")

In [None]:
# Cosine similarity between the rows of UUM_cosine and the columns of the URM,
# top 500 tracks per playlist.
row_group = 1000
# The result has one column per track (URM_normalize.shape[1]), so the all-zero
# fallback rows must be that wide. They were previously sized to the number of
# playlists, which makes the final vstack fail whenever a fallback row is used.
def_rows_i = csr_matrix((row_group, URM_normalize.shape[1]))
URM_UUM_cosine = dot_with_top(UUM_cosine, URM_normalize, def_rows_i, top=500, row_group=row_group, similarity="cosine-old")

In [None]:
# Item-item cosine similarity computed on URM_UUM_cosine (top 50 per track).
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track).
def_rows_i = csr_matrix((row_group, URM_UUM_cosine.shape[1]))
TTM_UUM_cosine = dot_with_top(URM_UUM_cosine.transpose(), URM_UUM_cosine, def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

In [None]:
"""
# Calibration
def calibrate_predictions(pred, theta=0.5):
    max_r = np.amax(pred, axis=0)
    mean_r = np.mean(pred, axis=0)

    pred_coo = pred.tocoo()
    pred_csr = pred.tocsr()
    max_r_csr = max_r.tocsr()

    counter = 0
    for i,j,v in zip(pred_coo.row, pred_coo.col, pred_coo.data):
        if v >= max_r_csr[0,j]:
            pred_csr[i,j] = 1
        elif v >= mean_r[0,j]:
            pred_csr[i,j] = theta + (1 - theta)*((v - mean_r[0,j])/(max_r_csr[0,j] - mean_r[0,j]))
        else:
            pred_csr[i,j] = theta * v / mean_r[0,j]
        counter += 1
        if counter % 10000 == 0:
            print("{0} out of {1}".format(counter, len(pred.data)))
    
    return pred_csr"""

## Album

#### using non-corrected album

In [None]:
# Distinct album ids currently stored in tracks.album (displayed for inspection).
unique_albums = tracks.album.unique()
unique_albums

In [None]:
# Inverted index: album id -> list of the track ids on that album
album_tracks = {}
for row in tracks.itertuples():
    album_tracks.setdefault(row.album, []).append(row.track_id)

In [None]:
def get_IAM_album(tracks, target_tracks, norm="no"):
    """
    Build the track x album matrix (one non-zero entry per track) as a lil_matrix.

    With norm="idf" the entry is log(500 / (nq + 10)), where nq is the number
    of tracks on the album taken from the global `album_tracks`; any other
    norm (default "no") stores 1. `target_tracks` is not used.
    """
    n_albums = max(tracks.album.unique()) + 1
    IAM_album = lil_matrix((len(tracks), n_albums))
    use_idf = norm == "idf"

    for counter, row in enumerate(tracks.itertuples()):
        if use_idf:
            weight = math.log(500/(len(album_tracks[row.album]) + 10))
        else:
            weight = 1
        IAM_album[row.track_id, row.album] = weight
        # Progress output every 100 tracks.
        if counter % 100 == 0:
            print(counter)

    return IAM_album

In [None]:
# Binary track x album matrix.
IAM_album = get_IAM_album(tracks, target_tracks, norm="no")

In [None]:
# Track x track matrix: non-zero where two tracks share the same album id.
SYM_ALBUM = IAM_album.dot(IAM_album.transpose())

## Artist
Same steps as for Album

In [None]:
# Distinct artist ids.
unique_artists = tracks.artist_id.unique()

In [None]:
# Inverted index: artist id -> list of the track ids by that artist
artist_tracks = {}
for row in tracks.itertuples():
    artist_tracks.setdefault(row.artist_id, []).append(row.track_id)

In [None]:
# Item Artist Matrix
def get_IAM(tracks, target_tracks, norm="no", n_best=5):
    """
    Build the track x artist matrix (one non-zero entry per track) as a lil_matrix.

    With norm="idf" the entry is log(500 / (nq + 0.5)), where nq is the number
    of tracks by the artist taken from the global `artist_tracks`; any other
    norm (default "no") stores 1. `target_tracks` and `n_best` are not used.
    """
    n_artists = max(tracks.artist_id.unique()) + 1
    IAM = lil_matrix((len(tracks), n_artists))
    use_idf = norm == "idf"

    for counter, row in enumerate(tracks.itertuples()):
        if use_idf:
            weight = math.log(500/(len(artist_tracks[row.artist_id]) + 0.5))
        else:
            weight = 1
        IAM[row.track_id, row.artist_id] = weight
        # Progress output every 1000 tracks.
        if counter % 1000 == 0:
            print(counter)

    return IAM

In [None]:
# Step 2: binary track x artist matrix
IAM = get_IAM(tracks, target_tracks, norm="no")

In [None]:
# Step 3: track x track matrix, non-zero where two tracks share the same artist
SYM_ARTIST = IAM.dot(IAM.transpose())

# Tags

In [None]:
# Inverted index: tag -> list of the track ids carrying that tag
# (same computation as in the "Recover tags" section, repeated here)
tag_tracks = {}
for row in tracks.itertuples():
    for tag in row.tags:
        tag_tracks.setdefault(tag, []).append(row.track_id)

In [None]:
# Item Tag Matrix ITM
def get_ITM(tracks, tag_tracks, norm="no", best_tag=False):
    """
    Build the track x tag matrix as a lil_matrix from the `tag_tracks` index.

    For a tag carried by nq tracks the stored weight is log(500 / (nq + 1))
    for norm="idf", sqrt(500 / (nq + 1)) for norm="sqrt", and 1 otherwise
    (default "no"). When `best_tag` is True only the number of columns changes:
    it is taken from the keys of the global `best_tag_tracks`, while the
    entries are still filled from `tag_tracks`.
    """
    width_source = best_tag_tracks if best_tag else tag_tracks
    unique_tags = list(width_source.keys())
    ITM = lil_matrix((len(tracks), max(unique_tags)+1))

    if norm == "idf":
        weight_of = lambda nq: math.log(500/(nq + 1))
    elif norm == "sqrt":
        weight_of = lambda nq: math.sqrt(500/(nq + 1))
    else:
        weight_of = lambda nq: 1

    for counter, (tag, track_ids) in enumerate(tag_tracks.items()):
        weight = weight_of(len(track_ids))
        for track_id in track_ids:
            ITM[track_id, tag] = weight
        # Progress output every 1000 tags.
        if counter % 1000 == 0:
            print(counter)

    return ITM

In [None]:
# Binary track x tag matrix.
ITM = get_ITM(tracks, tag_tracks, norm="no", best_tag=False)

In [None]:
# Step 2: produce item-item matrix with cosine similarity on tags (top 50 per track)
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track); used by
# dot_with_top for chunks that would otherwise be all zeros.
def_rows_i = csr_matrix((row_group, ITM.shape[0]))
SYM_TAG = dot_with_top(ITM, ITM.transpose(), def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

# Other similarities...

#### SYM_ALBUM_COMPLEX

In [None]:
# Track x playlist profile: similarity between each track's same-album row and every
# playlist of the URM (top 200 playlists per track).
row_group = 1000
# The result has one column per playlist (URM_normalize.shape[0]), so the all-zero
# fallback rows must be that wide. They were previously sized to the number of
# tracks, which makes the final vstack fail whenever a fallback row is used.
def_rows_i = csr_matrix((row_group, URM_normalize.shape[0]))
TR_PL_ALBUM = dot_with_top(SYM_ALBUM, URM_normalize.transpose(), def_rows_i, top=200, row_group=row_group, similarity="cosine-old")

# Item-item cosine similarity between those track profiles (top 50 per track).
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track).
def_rows_i = csr_matrix((row_group, TR_PL_ALBUM.shape[0]))
SYM_ALBUM_COMPLEX = dot_with_top(TR_PL_ALBUM, TR_PL_ALBUM.transpose(), def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

#### SYM_ARTIST_COMPLEX

In [None]:
# Track x playlist profile: similarity between each track's same-artist row and every
# playlist of the URM (top 200 playlists per track).
row_group = 1000
# The result has one column per playlist (URM_normalize.shape[0]), so the all-zero
# fallback rows must be that wide. They were previously sized to the number of
# tracks, which makes the final vstack fail whenever a fallback row is used.
def_rows_i = csr_matrix((row_group, URM_normalize.shape[0]))
TR_PL_ARTIST = dot_with_top(SYM_ARTIST, URM_normalize.transpose(), def_rows_i, top=200, row_group=row_group, similarity="cosine-old")

# Item-item cosine similarity between those track profiles (top 50 per track).
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track).
def_rows_i = csr_matrix((row_group, TR_PL_ARTIST.shape[0]))
SYM_ARTIST_COMPLEX = dot_with_top(TR_PL_ARTIST, TR_PL_ARTIST.transpose(), def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

#### Owner

In [None]:
# One owner array per track, empty for tracks that occur in no training playlist.
tracks['owners'] = [np.array([], dtype=int) for i in range(len(tracks))]

for row in track_playlists.itertuples():
    # Owner of every playlist containing the track, one entry per playlist (duplicates
    # kept). Built in one go so the values keep the dtype of the `owner` column rather
    # than being promoted to float by concatenation with an empty float array.
    owners = np.array([playlists.at[pl_id, 'owner'] for pl_id in row.playlist_ids])
    # DataFrame.set_value was removed in pandas 1.0; .at stores the array in the cell.
    tracks.at[row.track_id, 'owners'] = owners

In [None]:
# Inverted index: owner -> list of track ids found in that owner's playlists
# (a track is listed once per playlist of the owner that contains it)
owner_tracks = {}
for row in tracks.itertuples():
    for owner in row.owners:
        owner_tracks.setdefault(owner, []).append(row.track_id)

In [None]:
# Track x owner matrix: entry (t, o) accumulates one weight per occurrence of track t
# in the playlists of owner o.
unique_owners = list(owner_tracks.keys())
# The owner keys may be floats (they come from numpy arrays), so cast explicitly:
# matrix dimensions and indices must be integers.
OTM = lil_matrix((len(tracks), int(max(unique_owners)) + 1))

i = 0

owner_dict = owner_tracks
norm = "no"

for owner,track_ids in owner_dict.items():
    nq = len(track_ids)
    owner_col = int(owner)
    for track_id in track_ids:
        if norm == "idf":
            OTM[track_id,owner_col] += math.log(500/(nq + 1))
        elif norm == "sqrt":
            OTM[track_id,owner_col] += math.sqrt(500/(nq + 1))
        else:
            OTM[track_id,owner_col] += 1
    # Progress output every 1000 owners.
    if i % 1000 == 0:
        print(i)
    i += 1

In [None]:
# Step 2: produce item-item matrix with cosine similarity on owners (top 50 per track)
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track).
def_rows_i = csr_matrix((row_group, OTM.shape[0]))
SYM_OWNERS = dot_with_top(OTM, OTM.transpose(), def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

#### playlist titles

In [None]:
# One token array per track, empty for tracks that occur in no training playlist.
tracks['title_tokens'] = [np.array([], dtype=int) for i in range(len(tracks))]

for row in track_playlists.itertuples():
    # Title tokens of every playlist containing the track, concatenated once. The
    # leading empty integer array keeps the result integer-typed and makes the
    # call valid even for an empty playlist list.
    token_arrays = [playlists.at[pl_id, 'title'] for pl_id in row.playlist_ids]
    titles = np.concatenate([np.array([], dtype=int)] + token_arrays)
    # DataFrame.set_value was removed in pandas 1.0; .at stores the array in the cell.
    tracks.at[row.track_id, 'title_tokens'] = titles

In [None]:
# Inverted index: title token -> list of track ids whose playlists use that token
# (a track is listed once per occurrence of the token)
title_tracks = {}
for row in tracks.itertuples():
    for title_token in row.title_tokens:
        title_tracks.setdefault(title_token, []).append(row.track_id)

In [None]:
# Track x title-token matrix: entry (t, k) accumulates one weight per occurrence of
# token k in the titles of the playlists containing track t.
unique_titles = list(title_tracks.keys())
# The token keys may be floats (they come from numpy arrays), so cast explicitly:
# matrix dimensions and indices must be integers.
TTM_title = lil_matrix((len(tracks), int(max(unique_titles)) + 1))

i = 0

title_dict = title_tracks
norm = "no"

for title,track_ids in title_dict.items():
    nq = len(track_ids)
    title_col = int(title)
    for track_id in track_ids:
        if norm == "idf":
            TTM_title[track_id,title_col] += math.log(500/(nq + 1))
        elif norm == "sqrt":
            TTM_title[track_id,title_col] += math.sqrt(500/(nq + 1))
        else:
            TTM_title[track_id,title_col] += 1
    # Progress output every 1000 tokens.
    if i % 1000 == 0:
        print(i)
    i += 1

In [None]:
# Step 2: produce item-item matrix with cosine similarity on playlist-title tokens (top 50 per track)
row_group = 1000
# All-zero fallback rows, as wide as the result (one column per track).
def_rows_i = csr_matrix((row_group, TTM_title.shape[0]))
SYM_TITLE = dot_with_top(TTM_title, TTM_title.transpose(), def_rows_i, top=50, row_group=row_group, similarity="cosine-old")

#### SVD

In [None]:
from scipy.sparse.linalg import svds

In [None]:
%%time
# Truncated SVD of the binary URM with 20 factors. NOTE(review): per the scipy docs
# svds returns (u, s, vt), so `V` here is presumably already the transposed
# right factor (factors x tracks) - confirm before reusing it.
U, S, V = svds(URM_normalize, k=20)

In [None]:
# Turn the vector of singular values into a diagonal matrix.
# NOTE: rebinds S, so running this cell twice changes the result.
S = np.diag(S)

In [None]:
# Right-hand factor scaled by the singular values.
M2 = np.dot(S, V)

# Predictions

In [None]:
def from_prediction_matrix_to_dataframe(pred_matrix, target_playlists, keep_best=5,
                                       num_to_tracks={}, map_tracks=False):
    """
    Convert a sparse score matrix into a frame of recommended tracks per playlist.

    Row i of `pred_matrix` holds the scores for the i-th row of
    `target_playlists`; only the first `pred_matrix.shape[0]` playlists are
    used.

    Parameters
    ----------
    pred_matrix : scipy sparse matrix (playlists x tracks) of scores.
    target_playlists : DataFrame with a `playlist_id` column.
    keep_best : int
        Number of highest-scoring stored entries kept per row.
    num_to_tracks : mapping
        Column number -> track id; only read when `map_tracks` is True
        (the default mapping is never mutated).
    map_tracks : bool
        If True, translate column numbers through `num_to_tracks`.

    Returns
    -------
    DataFrame indexed by playlist id, with the columns of `target_playlists`
    plus `track_ids`: an array of recommendations, best first.
    """
    pred_matrix_csr = pred_matrix.tocsr()
    n_rows = pred_matrix_csr.shape[0]

    # Copy so the caller's frame is not modified when the new column is added.
    predictions = pd.DataFrame(target_playlists[:n_rows]).copy()
    predictions.index = target_playlists['playlist_id'][:n_rows]

    # Object array with one cell per playlist. Filling it by position and assigning
    # the column once replaces the per-row Series.set_value write-back, which was
    # removed in pandas 1.0.
    track_lists = np.empty(len(predictions), dtype=object)
    for target_row in range(len(predictions)):
        row_start = pred_matrix_csr.indptr[target_row]
        row_end = pred_matrix_csr.indptr[target_row+1]
        row_columns = pred_matrix_csr.indices[row_start:row_end]
        row_data = pred_matrix_csr.data[row_start:row_end]

        # Positions of the stored entries, by descending score.
        best_indexes = row_data.argsort()[::-1][:keep_best]
        pred = row_columns[best_indexes]

        if map_tracks:
            pred = np.array([num_to_tracks[t] for t in pred])

        track_lists[target_row] = pred

    predictions['track_ids'] = track_lists
    return predictions

In [None]:
class SVDPredictor:
    """
    Predictor that scores tracks as the product of two factor matrices.

    `m1` is indexed by playlist number (one row per playlist) and `m2` has one
    column per track; the score of a playlist/track pair is the corresponding
    entry of `m1 . m2`.
    """

    def __init__(self, name, m1, m2):
        self.name = name
        self.m1, self.m2 = m1, m2
        # MAP values recorded by print_MAP, in call order.
        self.maps = []
        # Saved scores, one row per predicted playlist; grows with every predict_group call.
        self.predictions = csr_matrix((0, self.m2.shape[1]))

    def predict_group(self, row_start, row_end, target_playlists, target_tracks, keep_best=5,
                      compute_MAP=False, test_good=None):
        """
        Score the playlists in rows [row_start, row_end) of `target_playlists`.

        For each playlist only tracks listed in `target_tracks` and not already
        in the playlist (per the global `playlist_tracks`) are kept, limited to
        the `keep_best` highest scores. A copy limited to the best five is
        appended to `self.predictions`. `compute_MAP` and `test_good` are not used.

        Returns a csr_matrix with one row per playlist of the group.
        """
        if not hasattr(self, 'ttracks'):
            # Cached on first use; later calls ignore their `target_tracks` argument.
            self.ttracks = list(set(target_tracks['track_id'].values))

        pl_group = target_playlists[row_start:row_end]
        group_ids = pl_group.playlist_id

        composed_URM = scipy.sparse.vstack([csr_matrix(self.m1[pl_id,:]) for pl_id in group_ids], 'csr')

        simil = np.array(composed_URM.dot(self.m2))
        simil_to_save = simil.copy()

        for i, pl_id in enumerate(group_ids):
            scores = simil[i]
            already_in_playlist = list(set(playlist_tracks.loc[pl_id]['track_ids']))

            ranked = scores.argsort()[::-1]
            # Keep target tracks only, and drop tracks the playlist already contains.
            allowed = np.in1d(ranked, self.ttracks) & np.in1d(ranked, already_in_playlist, invert=True)
            ranked = ranked[allowed][:keep_best]
            saved = ranked[:5]

            kept_scores = np.zeros(len(scores))
            kept_scores[ranked] = scores[ranked]
            saved_scores = np.zeros(len(scores))
            saved_scores[saved] = scores[saved]

            # `scores` is a view on simil[i]: write back only after both copies are built.
            simil[i] = kept_scores
            simil_to_save[i] = saved_scores

        self.predictions = scipy.sparse.vstack([self.predictions, simil_to_save], 'csr')

        return csr_matrix(simil)

    def print_MAP(self, test_good, target_playlists, num_to_tracks):
        """Evaluate the saved predictions with `util.evaluate`, print the value and append it to `self.maps`."""
        # NOTE(review): `util` is not imported in the visible part of this notebook - confirm where it comes from.
        predictions = from_prediction_matrix_to_dataframe(self.predictions, target_playlists, keep_best=5, num_to_tracks=num_to_tracks, map_tracks=True)
        current_map = util.evaluate(test_good, predictions, should_transform_test=False)
        print("{0}: {1}".format(self.name, current_map))
        self.maps.append(current_map)

    def get_predictors(self):
        """Return the list of elementary predictors, i.e. this object alone."""
        return [self]

In [None]:
class SimilarityPredictor:
    """
        Predictor that scores tracks through a similarity matrix.

        For a playlist row of "urm", the score of track t is the dot product between row t of
        "similarity" and the playlist row, divided by (sum of row t of "similarity" + 1).
        The top 5 predictions of every processed playlist are accumulated in "self.predictions";
        the scores computed by "print_MAP" are accumulated in "self.maps".
    """

    def __init__(self, name, urm, similarity):
        self.name = name
        self.urm = urm
        self.similarity = similarity
        self.predictions = csr_matrix((0, self.urm.shape[1]))
        self.maps = []

    def predict_group(self, row_start, row_end, target_playlists, target_tracks, keep_best=5,
                      compute_MAP=False, test_good=None):
        """
            Score the playlists in "target_playlists[row_start:row_end]" and keep the best tracks of each.

            For each playlist only the "keep_best" highest-scoring columns that are target tracks and
            are not already in the playlist (looked up in the global "playlist_tracks") are kept;
            every other score is set to zero. The top 5 of each playlist are also appended to
            "self.predictions".

            Column indexes are compared directly with the ids in "target_tracks" and "playlist_tracks".
            "compute_MAP" and "test_good" are accepted for interface compatibility and are not used here.

            Returns a csr_matrix with one row per playlist of the group.
        """
        if not hasattr(self, 'ttracks'):
            # Cached on the first call: later calls reuse it even if "target_tracks" is different.
            self.ttracks = list(set(target_tracks['track_id'].values))

        # "pl_group" is the set of the playlists that we want to make prediction for
        pl_group = target_playlists[row_start:row_end]

        rows_URM = [self.urm[pl_id, :] for pl_id in pl_group.playlist_id]
        composed_URM = scipy.sparse.vstack(rows_URM, 'csr')

        raw_scores = self.similarity.dot(composed_URM.transpose()).transpose().todense()
        # The "+ 1" keeps the denominator non-zero for tracks whose similarity row sums to zero.
        normalizer = self.similarity.sum(axis=1).transpose() + 1
        scores = np.array(np.divide(raw_scores, normalizer))

        simil = np.zeros_like(scores)
        simil_to_save = np.zeros_like(scores)

        for i, pl_id in enumerate(pl_group.playlist_id):
            row = scores[i]
            pl_tracks = list(set(playlist_tracks.loc[pl_id]['track_ids']))
            order = row.argsort()[::-1]  # column indexes, best score first
            # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
            eligible = np.isin(order, self.ttracks) & ~np.isin(order, pl_tracks)
            best_indexes = order[eligible][:keep_best]
            simil[i, best_indexes] = row[best_indexes]
            # Only the top 5 are stored: print_MAP evaluates "self.predictions" with keep_best=5.
            simil_to_save[i, best_indexes[:5]] = row[best_indexes[:5]]

        # Convert explicitly so that vstack only ever receives sparse blocks.
        self.predictions = scipy.sparse.vstack([self.predictions, csr_matrix(simil_to_save)], 'csr')

        return csr_matrix(simil)

    def print_MAP(self, test_good, target_playlists, num_to_tracks):
        """
            Evaluate the predictions accumulated so far against "test_good", print the score
            next to the predictor name and append it to "self.maps".
        """
        predictions = from_prediction_matrix_to_dataframe(self.predictions, target_playlists, keep_best=5, num_to_tracks=num_to_tracks, map_tracks=True)
        current_map = util.evaluate(test_good, predictions, should_transform_test=False)
        print("{0}: {1}".format(self.name, current_map))
        self.maps.append(current_map)

    def get_predictors(self):
        """
            Return the predictors whose "maps" should be plotted: for a single predictor, only itself.
        """
        return [self]

In [None]:
import functools

class SumEnsemblePredictor:
    """
        Ensemble that combines the score matrices of its predictors with a weighted sum.

        "weights" holds one weight per predictor, in the same order as "predictors"; when it is
        omitted or empty every predictor gets weight 1. The top 5 predictions of every processed
        playlist are accumulated in "self.predictions"; the scores computed by "print_MAP" are
        accumulated in "self.maps".
    """

    def __init__(self, name, predictors, original_urm, weights=None):
        self.name = name
        self.predictors = predictors
        self.predictions = csr_matrix((0, original_urm.shape[1]))
        self.original_urm = original_urm
        # "None" replaces the former mutable default; an explicit empty list is still accepted.
        if weights is None or len(weights) == 0:
            self.weights = [1 for p in predictors]
        else:
            self.weights = weights
        self.maps = []

    def predict_group(self, row_start, row_end, target_playlists, target_tracks, keep_best=5, compute_MAP=False, test_good=None):
        """
            Ask every predictor for the full (unfiltered by "keep_best") scores of the playlists in
            "target_playlists[row_start:row_end]", sum them with "self.weights" and keep, for each
            playlist, only the "keep_best" highest summed scores. The top 5 of each playlist are
            also appended to "self.predictions".

            When "compute_MAP" is true, "print_MAP" of every inner predictor is called after its
            prediction; that call reads the notebook-global "num_to_tracks".

            Returns a sparse matrix with one row per playlist of the group.
        """
        # "pl_group" is the set of the playlists that we want to make prediction for
        pl_group = target_playlists[row_start:row_end]
        n_tracks = self.original_urm.shape[1]

        # check fast mode: only if ensemble of SimilarityPredictor
        # NOTE(review): "ok_fast" starts as False and the loop can only set it to False again, so the
        # fast path below is never taken. It is left disabled on purpose here: the fast path divides by
        # the row sums of the *summed* similarity, which is not the same as summing the already
        # normalized scores of each SimilarityPredictor. Confirm the intent before enabling it.
        ok_fast = False
        for predictor in self.predictors:
            if type(predictor) is not SimilarityPredictor or predictor.urm is not self.predictors[0].urm:
                ok_fast = False
                break

        if ok_fast:
            if not hasattr(self, 'fast_predictor'):
                print("Using SimilarityPredictor fast mode for {0}".format(self.name))
                symilarities = [p.similarity for p in self.predictors]
                fast_sym = self.weights[0] * symilarities[0].tolil()
                for i, s in enumerate(symilarities[1:], start=1):
                    fast_sym += self.weights[i] * s.tolil()
                # Use this ensemble's own predictors, not the notebook-global "predictors" list.
                self.fast_predictor = SimilarityPredictor("fast_predictor", self.predictors[0].urm, fast_sym)

            res_urm = self.fast_predictor.predict_group(row_start, row_end, target_playlists, target_tracks, keep_best=n_tracks,
                                                        compute_MAP=False, test_good=None)

        else:
            predictions = []
            for predictor in self.predictors:
                # keep_best = number of tracks: the inner predictors must not truncate their scores.
                pred = predictor.predict_group(row_start, row_end, target_playlists, target_tracks, keep_best=n_tracks,
                                               compute_MAP=compute_MAP, test_good=test_good)
                if compute_MAP:
                    predictor.print_MAP(test_good, target_playlists, num_to_tracks)
                predictions.append(pred)

            res_urm = self.weights[0] * predictions[0].tolil()
            for i, p in enumerate(predictions[1:], start=1):
                res_urm += self.weights[i] * p.tolil()

        res_urm_to_save = res_urm.copy()

        res_urm = res_urm.tolil()
        res_urm_to_save = res_urm_to_save.tolil()

        for i in range(len(pl_group.playlist_id)):
            row = res_urm[i].toarray()[0]
            best_indexes = row.argsort()[::-1][:keep_best]  # keep only the best
            new_row = np.zeros(len(row))
            new_row[best_indexes] = row[best_indexes]
            # Only the top 5 are stored: print_MAP evaluates "self.predictions" with keep_best=5.
            new_row_to_save = np.zeros(len(row))
            new_row_to_save[best_indexes[:5]] = row[best_indexes[:5]]
            res_urm[i] = new_row
            res_urm_to_save[i] = new_row_to_save

        self.predictions = scipy.sparse.vstack([self.predictions, res_urm_to_save], 'csr')

        return res_urm

    def print_MAP(self, test_good, target_playlists, num_to_tracks):
        """
            Evaluate the predictions accumulated so far against "test_good", print the score
            next to the ensemble name and append it to "self.maps".
        """
        predictions = from_prediction_matrix_to_dataframe(self.predictions, target_playlists, keep_best=5, num_to_tracks=num_to_tracks, map_tracks=True)
        current_map = util.evaluate(test_good, predictions, should_transform_test=False)
        print("{0}: {1}".format(self.name, current_map))
        self.maps.append(current_map)

    def get_predictors(self):
        """
            Return the inner predictors followed by the ensemble itself.
        """
        return self.predictors + [self]

In [None]:
from random import uniform

def get_random_choice_with_probabilities(probabilities):
    """
        Draw a random position of "probabilities": a uniform number in [0, 1] is compared with
        the running sum of the values, and the first position whose running sum reaches it is
        returned. Returns -1 when no running sum reaches the drawn number.
    """
    draw = uniform(0, 1)
    running_total = 0
    position = 0
    for weight in probabilities:
        running_total += weight
        if running_total >= draw:
            return position
        position += 1
    return -1

class StochasticEnsemblePredictor:
    """
        Ensemble that builds each playlist's ranking by repeatedly picking one of its predictors
        at random and taking that predictor's next best track that has not been chosen yet.

        "probabilities" holds one value per predictor, in the same order as "predictors"; the
        values are normalized to sum to 1. When it is omitted or empty every predictor is equally
        likely. The top 5 predictions of every processed playlist are accumulated in
        "self.predictions"; the scores computed by "print_MAP" are accumulated in "self.maps".
    """

    def __init__(self, name, predictors, original_urm, probabilities=None):
        self.name = name
        self.predictors = predictors
        self.predictions = csr_matrix((0, original_urm.shape[1]))
        self.original_urm = original_urm
        # Without probabilities every draw used to return -1, i.e. always the last predictor;
        # fall back to a uniform choice instead.
        if probabilities is None or len(probabilities) == 0:
            probabilities = [1 for p in predictors]
        total = sum(probabilities)
        self.probabilities = [p / total for p in probabilities]
        self.maps = []

    def predict_group(self, row_start, row_end, target_playlists, target_tracks, keep_best=5, compute_MAP=False, test_good=None):
        """
            Ask every predictor for the full scores of the playlists in
            "target_playlists[row_start:row_end]" and pick "keep_best" distinct tracks per playlist.

            The j-th picked track (j starting from 0) gets the value "keep_best - j", so the output
            holds ranks, not the predictors' scores. The first 5 picks of each playlist are also
            appended to "self.predictions".

            When "compute_MAP" is true, "print_MAP" of every inner predictor is called after its
            prediction; that call reads the notebook-global "num_to_tracks".

            Returns a lil_matrix with one row per playlist of the group.
        """
        # keep_best = number of tracks: the inner predictors must not truncate their scores.
        # (Read from the instance: there is no "original_urm" name in scope inside this method.)
        n_tracks = self.original_urm.shape[1]

        predictions = []
        for predictor in self.predictors:
            pred = predictor.predict_group(row_start, row_end, target_playlists, target_tracks, keep_best=n_tracks,
                                           compute_MAP=compute_MAP, test_good=test_good)
            if compute_MAP:
                predictor.print_MAP(test_good, target_playlists, num_to_tracks)
            predictions.append(pred)

        n_rows, n_cols = predictions[0].shape
        res_urm = lil_matrix((n_rows, n_cols))
        res_urm_to_save = res_urm.copy()

        for i in range(n_rows):
            # One ranking (column indexes, best score first) per predictor.
            best_indexes = [p[i].toarray()[0].argsort()[::-1] for p in predictions]
            # counters[c] = how many entries of predictor c's ranking were already consumed.
            counters = [0 for j in best_indexes]

            res_indexes = []
            for j in range(0, keep_best):
                c = get_random_choice_with_probabilities(self.probabilities)
                new_index = best_indexes[c][counters[c]]
                counters[c] += 1
                # Skip tracks that were already picked through another predictor.
                while new_index in res_indexes:
                    new_index = best_indexes[c][counters[c]]
                    counters[c] += 1
                res_indexes.append(new_index)

            new_row = np.zeros(n_cols)
            new_row_to_save = np.zeros(n_cols)
            for j, idx in enumerate(res_indexes):
                new_row[idx] = keep_best - j
                # Only the top 5 are stored: print_MAP evaluates "self.predictions" with keep_best=5.
                if j < 5:
                    new_row_to_save[idx] = keep_best - j

            res_urm[i] = new_row
            res_urm_to_save[i] = new_row_to_save

        self.predictions = scipy.sparse.vstack([self.predictions, res_urm_to_save], 'csr')

        return res_urm

    def print_MAP(self, test_good, target_playlists, num_to_tracks):
        """
            Evaluate the predictions accumulated so far against "test_good", print the score
            next to the ensemble name and append it to "self.maps".
        """
        predictions = from_prediction_matrix_to_dataframe(self.predictions, target_playlists, keep_best=5, num_to_tracks=num_to_tracks, map_tracks=True)
        current_map = util.evaluate(test_good, predictions, should_transform_test=False)
        print("{0}: {1}".format(self.name, current_map))
        self.maps.append(current_map)

    def get_predictors(self):
        """
            Return the inner predictors followed by the ensemble itself.
        """
        return self.predictors + [self]

In [None]:
from recsys import utility as util
import matplotlib.patches as mpatches

def make_predictions(predictor, original_urm, target_playlists, target_tracks,
                     row_group=1000,
                     compute_MAP=False, test=None, num_to_tracks="nope", graph_name="MAP"):
    """
        Run "predictor" over "target_playlists" in groups of "row_group" rows and return the
        predictions as the dataframe built by "from_prediction_matrix_to_dataframe" (top 5 per playlist).

        When "compute_MAP" is true, "test" is converted with "get_playlist_track_list2" and re-indexed
        through the notebook-global "playlist_to_num"; after every group "predictor.print_MAP" is
        called, and at the end the "maps" of every predictor returned by "predictor.get_predictors()"
        are plotted and saved to "<graph_name>.png". Otherwise "test" is passed through unchanged.

        NOTE(review): the predictors append to their "predictions" and "maps" and never reset them,
        so a predictor that was already used in a previous call carries that state into this one.

        Raises ValueError if "row_group" is smaller than 1 (the loop could never advance).
    """
    if row_group < 1:
        raise ValueError("row_group must be at least 1")

    if compute_MAP:
        test_good = get_playlist_track_list2(test)
        test_good.index = test_good.playlist_id.apply(lambda pl_id: playlist_to_num[pl_id])
    else:
        test_good = test

    n_playlists = len(target_playlists)

    # Collected per group and stacked once at the end; the empty block keeps the
    # result well-defined (0 rows) when there are no target playlists.
    group_results = [csr_matrix((0, original_urm.shape[1]))]

    row_start = 0
    while row_start < n_playlists:
        # We'll do dot products for all playlists in "target_playlists" from "row_start" to "row_end"
        row_end = min(row_start + row_group, n_playlists)

        print("From {0} to {1}:".format(row_start, row_end))

        simil_urm = predictor.predict_group(row_start, row_end, target_playlists, target_tracks, keep_best=5,
                                            compute_MAP=compute_MAP, test_good=test_good)
        if compute_MAP:
            predictor.print_MAP(test_good, target_playlists, num_to_tracks)

        print()

        group_results.append(simil_urm)
        row_start = row_end

    res_urm = scipy.sparse.vstack(group_results, 'csr')

    if compute_MAP:
        # show MAP graph
        # One point per processed group. Counting the groups (instead of int(len / row_group) + 1)
        # keeps x and the MAP lists the same length when len(target_playlists) is an exact
        # multiple of row_group.
        bins = len(group_results) - 1
        x = [i / bins for i in range(0, bins)]
        colors = ["red", "green", "blue", "orange", "m", "gold", "c", "navy", "sienna", "grey"]
        predictors = predictor.get_predictors()
        maps = [p.maps if len(p.maps) > 0 else [0 for b in range(0, bins)] for p in predictors]
        patches = []
        for i, m in enumerate(maps):
            color = colors[i % len(colors)]  # cycle when there are more predictors than colors
            plt.plot(x, m, color)
            patches.append(mpatches.Patch(color=color, label=predictors[i].name))
        plt.legend(handles=patches)

        plt.xlabel('percentage of playlists considered')
        plt.ylabel('MAP')
        plt.grid(True)
        plt.savefig(graph_name + ".png")
        plt.show()

    predictions = from_prediction_matrix_to_dataframe(res_urm, target_playlists, keep_best=5, num_to_tracks=num_to_tracks, map_tracks=True)

    return predictions

In [None]:
# Snapshot every similarity matrix before it is modified, so that the restore cell
# further down can bring the originals back.
TTM_dot_copy = TTM_dot.copy()
TTM_cosine_copy = TTM_cosine.copy()
TTM_UUM_cosine_copy = TTM_UUM_cosine.copy()
SYM_ALBUM_copy = SYM_ALBUM.copy()
SYM_ARTIST_copy = SYM_ARTIST.copy()
SYM_OWNERS_copy = SYM_OWNERS.copy()

In [None]:
# Snapshot of the similarity matrices that leaves out SYM_ALBUM and SYM_ARTIST.
# NOTE(review): these four assignments repeat part of the previous cell; running this cell
# after the normalization cell overwrites the snapshots with the normalized matrices.
TTM_dot_copy = TTM_dot.copy()
TTM_cosine_copy = TTM_cosine.copy()
TTM_UUM_cosine_copy = TTM_UUM_cosine.copy()
SYM_OWNERS_copy = SYM_OWNERS.copy()

In [None]:
# Restore the similarity matrices from their snapshots (e.g. to undo the normalization below).
# SYM_ALBUM_copy and SYM_ARTIST_copy only exist if the full snapshot cell has been run.
TTM_dot = TTM_dot_copy.copy()
TTM_cosine = TTM_cosine_copy.copy()
TTM_UUM_cosine = TTM_UUM_cosine_copy.copy()
SYM_ALBUM = SYM_ALBUM_copy.copy()
SYM_ARTIST = SYM_ARTIST_copy.copy()
SYM_OWNERS = SYM_OWNERS_copy.copy()

In [None]:
from sklearn.preprocessing import normalize

# Rescale every similarity matrix along axis 0 (each column gets unit norm) before they are
# combined in the ensemble. The matrices are overwritten in place under the same names.
TTM_dot = normalize(TTM_dot, norm='l2', axis=0)
TTM_cosine = normalize(TTM_cosine, norm='l2', axis=0)
TTM_UUM_cosine = normalize(TTM_UUM_cosine, norm='l2', axis=0)
# NOTE(review): SYM_ALBUM is the only matrix normalized with 'l1' — confirm this is intended.
SYM_ALBUM = normalize(SYM_ALBUM, norm='l1', axis=0)
SYM_ARTIST = normalize(SYM_ARTIST, norm='l2', axis=0)
SYM_OWNERS =  normalize(SYM_OWNERS, norm='l2', axis=0)

In [None]:
# One SimilarityPredictor per similarity matrix, all scoring the same URM_pow.
ii_1 = SimilarityPredictor("ii_1", URM_pow, TTM_dot)
ii_2 = SimilarityPredictor("ii_2", URM_pow, TTM_cosine)
ii_3 = SimilarityPredictor("ii_3", URM_pow, TTM_UUM_cosine)
album_predictor = SimilarityPredictor("album", URM_pow, SYM_ALBUM)
artist_predictor = SimilarityPredictor("artist", URM_pow, SYM_ARTIST)
owner_predictor = SimilarityPredictor("owner", URM_pow, SYM_OWNERS)
#tag_predictor = SimilarityPredictor("tag", URM_pow, SYM_TAG)
#title_predictor = SimilarityPredictor("title", URM_pow, SYM_TITLE)
svd_predictor = SVDPredictor("svd", U, M2)

# Note: ii_2 is built above but is not part of the ensemble.
# No weights are passed, so SumEnsemblePredictor gives every predictor weight 1.
predictors = [ii_1, ii_3, album_predictor, artist_predictor, owner_predictor, svd_predictor]
final_predictor = SumEnsemblePredictor("final_ens", predictors, URM_normalize)

In [None]:
# Evaluation run: predict in groups of 500 playlists, print the MAP after every group
# and save the resulting graph as "MAP_1.png".
make_predictions(final_predictor, URM_normalize, target_playlists, target_tracks,
                 row_group=500,
                 compute_MAP=True, test=test, num_to_tracks=num_to_tracks, graph_name="MAP_1")

In [None]:
# Second evaluation run, saved as "MAP_2.png".
# NOTE(review): the predictors never reset their "predictions" and "maps", so "final_predictor"
# (and the predictors inside it) must be rebuilt before this cell; otherwise the results of the
# previous run are still accumulated in them.
make_predictions(final_predictor, URM_normalize, target_playlists, target_tracks,
                 row_group=500,
                 compute_MAP=True, test=test, num_to_tracks=num_to_tracks, graph_name="MAP_2")

In [None]:
# Final run without evaluation: with compute_MAP=False no MAP is printed and no graph is
# saved, so "test" is only passed through and "graph_name" is not used.
# NOTE(review): like the evaluation runs, this expects freshly built predictors.
predictions = make_predictions(final_predictor, URM_normalize, target_playlists, target_tracks,
                 row_group=1000,
                 compute_MAP=False, test=train, num_to_tracks=num_to_tracks, graph_name="MAP_mmm")

In [None]:
# Preview the first rows of the predictions returned by make_predictions.
predictions.head()

## Format the predictions and export them to CSV

In [None]:
# Keep an untouched deep copy: the cells below modify "predictions" in place.
pr_copy = predictions.copy(deep=True)

In [None]:
# Overwrite "playlist_id" with the values stored in "playlist_id_tmp"
# (presumably the original playlist ids — TODO confirm against from_prediction_matrix_to_dataframe).
predictions['playlist_id'] = predictions['playlist_id_tmp']

In [None]:
# The temporary column is no longer needed. Not re-runnable: a second run fails because
# "playlist_id_tmp" has already been dropped.
predictions = predictions.drop("playlist_id_tmp", axis=1)

In [None]:
# Check the result of the column clean-up.
predictions.head()

In [None]:
# Serialise the track ids of every playlist as one space-separated string,
# then write the result to "results.csv" without the index column.
predictions['track_ids'] = predictions['track_ids'].map(
    lambda tracks: ' '.join(str(track) for track in tracks)
)
predictions.to_csv('results.csv', index=False)