In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.sparse import *
from scipy.sparse.linalg import svds
import math

from recsys.preprocess import *

import functools

#from recsys.utility import *

#RANDOM_STATE = 666

#np.random.seed(RANDOM_STATE)

%matplotlib inline

In [None]:
def train_test_split(train, test_size=0.3, min_playlist_tracks=7, n_test_tracks=5):
    """
        Hold out a few random tracks from a random subset of the target playlists.

        Only playlists that are listed in the module-level "target_playlists_original"
        dataframe and that have at least "min_playlist_tracks" rows in "train" are eligible.

        Arguments:
            - train: dataframe with (at least) the columns 'playlist_id' and 'track_id'
            - test_size: fraction of the eligible playlists that become test playlists
            - min_playlist_tracks: minimum number of rows a playlist needs to be eligible
            - n_test_tracks: number of rows held out from each test playlist (default 5).
                It must not exceed the number of rows of any eligible playlist, otherwise
                DataFrame.sample raises a ValueError.

        Returns a tuple (train, test, target_playlists, target_tracks) where:
            - train: the input without the held-out rows
            - test: the held-out rows
            - target_playlists: dataframe with a 'playlist_id' column (the test playlists)
            - target_tracks: dataframe with a 'track_id' column (sorted distinct held-out tracks)
    """
    is_target = train.playlist_id.isin(target_playlists_original.playlist_id)
    playlists = train[is_target].groupby('playlist_id').count()

    # Only playlists with at least "min_playlist_tracks" tracks are considered.
    to_choose_playlists = playlists[playlists['track_id'] >= min_playlist_tracks].index.values

    # Among these playlists, "test_size * len(to_choose_playlists)" distinct playlists are chosen for testing.
    # It's a numpy array that contains playlist ids.
    target_playlists = np.random.choice(to_choose_playlists, replace=False, size=int(test_size * len(to_choose_playlists)))

    # Collect the samples in lists and merge them once at the end: growing an (initially
    # float) array with np.union1d at every iteration is quadratic and silently turns the
    # integer track ids / index labels into floats.
    sampled_tracks = []
    sampled_indexes = []
    for p in target_playlists:
        selected_df = train[train['playlist_id'] == p].sample(n_test_tracks)
        sampled_tracks.append(selected_df['track_id'].values)
        sampled_indexes.append(selected_df.index.values)

    if sampled_indexes:
        # np.unique returns the sorted distinct values, like the chained union1d did.
        target_tracks = np.unique(np.concatenate(sampled_tracks))
        indexes = np.unique(np.concatenate(sampled_indexes))
    else:
        # No test playlist was selected: keep empty results with the right dtypes.
        target_tracks = np.array([], dtype=train['track_id'].dtype)
        indexes = np.array([], dtype=train.index.dtype)

    test = train.loc[indexes].copy()
    train = train.drop(indexes)

    return train, test, pd.DataFrame(target_playlists, columns=['playlist_id']), pd.DataFrame(target_tracks, columns=['track_id'])


In [None]:
import numpy as np
import scipy
from scipy.sparse import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def dot_with_top(m1, m2, def_rows_g, top=-1, row_group=1, similarity="dot", shrinkage=0.000001, alpha=1):
    """
        Produces the product between the sparse matrices m1 and m2, keeping only the
        "top" largest stored values of every result row.

        Arguments:
            - m1, m2: sparse matrices, with m1.shape[1] == m2.shape[0]
            - def_rows_g: sparse matrix with (at least) "row_group" rows and m2.shape[1]
                columns; its rows are used as the result for rows that produce no value
            - top: number of values kept per row. If top <= 0 the plain m1.dot(m2) is returned.
            - row_group: number of rows of m1 multiplied at once (more rows = faster, more memory)
            - similarity: "dot" / "dot-old" (plain product), "cosine-old" (sklearn
                cosine_similarity), "cosine" and "cosine-asym" (product divided by a
                norm built from row/column sums). Any other value falls back to the plain product.
            - shrinkage: added to the denominator of "cosine" and "cosine-asym" only
            - alpha: exponent used by "cosine-asym"

        Returns a csr matrix with m1.shape[0] rows.

        Code taken from
            https://stackoverflow.com/questions/29647326/sparse-matrix-dot-product-keeping-only-n-max-values-per-result-row
            and optimized for smart dot products.
    """
    if top <= 0:
        return m1.dot(m2)

    m2_transposed = m2.transpose()
    l2 = m2.sum(axis=0) # by cols

    n_rows = m1.shape[0]
    final_rows = []
    for row_id in range(0, n_rows, row_group):
        last_row = min(row_id + row_group, n_rows)
        rows = m1[row_id:last_row]
        if rows.count_nonzero() > 0:
            if similarity == "cosine-old":
                res_rows = cosine_similarity(rows, m2_transposed, dense_output=False)
            elif similarity == "cosine":
                res_rows = csr_matrix(rows.dot(m2) / (np.sqrt(rows.sum(axis=1)) * np.sqrt(l2) + shrinkage))
            elif similarity == "cosine-asym":
                res_rows = csr_matrix(rows.dot(m2) / (np.power(rows.sum(axis=1), alpha) * np.power(m2.sum(axis=0), (1 - alpha)) + shrinkage))
            else:
                # "dot", "dot-old" and unknown names: plain sparse product. Adding the
                # shrinkage constant to a sparse matrix is not supported by scipy (and
                # would not change the per-row ranking), so it is not applied here.
                res_rows = rows.dot(m2)

            if res_rows.count_nonzero() > 0:
                for res_row in res_rows:
                    # argsort(...)[-top:] also covers rows with fewer than "top" stored
                    # values: in that case every stored value is kept.
                    keep = np.argsort(res_row.data)[-top:]
                    data = res_row.data[keep]
                    cols = res_row.indices[keep]
                    final_rows.append(csr_matrix((data, (np.zeros(len(keep), dtype=int), cols)), shape=res_row.shape))
            else:
                print("Add empty 2")
                for res_row in res_rows:
                    final_rows.append(def_rows_g[0])
        else:
            print("Add empty 3")
            # Take exactly as many default rows as the group contains: the last group
            # may be shorter than "row_group" and must not add extra rows to the result.
            final_rows.append(def_rows_g[:last_row - row_id])
        # Progress: same value the original loop printed after each group.
        print(row_id + row_group)
    return scipy.sparse.vstack(final_rows, 'csr')

In [None]:
def make_predictions(test=None, target_playlists=None, urm=None,
                     similarities=[], playlist_params=None,
                     compute_MAP=False, row_group=100, verbose=False):
    """
        Produces a prediction dataframe where each row corresponds to a playlist in "target_playlists".
        If compute_MAP is true, then it prints the running MAP every "row_group" playlists.
        It's optimized for doing dot products for different playlists at once.
            "row_group" is the number of playlists in each of these optimized dot products.
            The higher is row_group, the faster are the predictions but more memory is used.

        All predictions are done in the following way:
            URM * (a * SYM_0 + b * SYM_1 + ...)
        where every SYM_k product is divided by (row sums of SYM_k + 1).

        Arguments:
            - test: needed for computing MAP
            - target_playlists: a dataframe containing the target playlists we want to predict for
            - urm: the urm used for making predictions
            - similarities: list with similarities used for making predictions
            - playlist_params: dataframe containing parameters used for doing predictions. The name convention
                is the following: the parameter for SYM_0 is "param_0", etc...
            - compute_MAP: if true, "test" is used to print the running MAP
            - row_group: number of playlists predicted at once
            - verbose: currently unused

        It also reads the module-level "target_tracks", "playlist_tracks", "num_to_tracks"
        and (for the MAP only) "playlist_to_num" and "get_playlist_track_list2".

        Returns the predictions dataframe, indexed by playlist id, with the columns
        'track_ids' (original track ids) and 'track_ids_not_mapped' (internal track
        numbers), each holding at most 5 recommendations.
    """
    # Create predictions dataframe
    predictions = pd.DataFrame(target_playlists)
    predictions.index = target_playlists['playlist_id']
    ttracks = set(target_tracks['track_id'].values)
    if compute_MAP:
        test_good = get_playlist_track_list2(test)
        test_good.index = test_good.playlist_id.apply(lambda pl_id: playlist_to_num[pl_id])
        print(len(test_good))

    # The normalisation of each similarity does not depend on the playlist group:
    # compute it once instead of once per group.
    sym_norms = [SYM.sum(axis=1).transpose() + 1 for SYM in similarities]

    # Recommendations are collected in playlist order and stored in the dataframe at the end.
    all_pred = []
    all_pred_not_mapped = []

    # This is the sum of all the AP of the playlists.
    # When we print the MAP, we divide "sum_ap" by the number of considered playlists.
    sum_ap = 0

    # Let's start the predictions!
    n_playlists = len(target_playlists)
    row_start = 0
    while row_start < n_playlists:
        # We'll do dot products for all playlists in "target_playlists" from "row_start" to "row_end"
        row_end = min(row_start + row_group, n_playlists)

        # "pl_group" is the set of the playlists that we want to make prediction for
        pl_group = target_playlists[row_start:row_end]

        # Build a matrix where, for each playlist in "pl_group", we take the correspondent URM row slice
        composed_URM = scipy.sparse.vstack([urm[pl_id,:] for pl_id in pl_group.playlist_id], 'csr')
        composed_URM_T = composed_URM.transpose()

        # Compute predictions for current playlist group: here we do all the smart dot products...
        simil_ar = []
        for SYM, sym_norm in zip(similarities, sym_norms):
            simil_ar.append(np.array(np.divide(SYM.dot(composed_URM_T).transpose().todense(), sym_norm)))

        # Now we should consider one playlist at a time, take its own personalized parameters and make the prediction
        for i,pl_id in enumerate(pl_group.playlist_id):
            # Tracks that we know are in the playlist (so we shouldn't recommend them)
            pl_tracks = set(playlist_tracks.loc[pl_id]['track_ids'])

            # Retrieve parameters
            pl_params = playlist_params.loc[pl_id]
            params = [pl_params["param_" + str(it)] for it in range(len(similarities))]

            simil = params[0] * simil_ar[0][i]
            for p in range(1,len(simil_ar)):
                simil += params[p] * simil_ar[p][i]
            sorted_ind = simil.argsort()[::-1]

            # Predict: walk the tracks from the best score down until 5 valid ones are found.
            pred_not_mapped = []
            pred = []
            for tr in sorted_ind:
                if len(pred) >= 5:
                    break
                if (tr in ttracks) and (tr not in pl_tracks) and (num_to_tracks[tr] not in pred):
                    pred_not_mapped.append(tr)
                    pred.append(num_to_tracks[tr])

            all_pred.append(np.array(pred))
            all_pred_not_mapped.append(np.array(pred_not_mapped))

            # Update MAP
            if compute_MAP:
                ap = 0
                # A playlist without any recommendation contributes an AP of 0
                # (dividing by len(pred) would raise a ZeroDivisionError).
                if pred:
                    tr_ids = test_good.loc[pl_id]['track_ids']
                    correct = 0
                    for it, t in enumerate(pred):
                        if t in tr_ids:
                            correct += 1
                            ap += correct / (it+1)
                    ap /= len(pred)
                sum_ap += ap

        # Update "row_start" to "row_end" and proceed to next pl_group
        row_start = row_end

        print(row_start)
        if compute_MAP:
            print(sum_ap / row_start)

    predictions['track_ids'] = all_pred
    predictions['track_ids_not_mapped'] = all_pred_not_mapped

    return predictions

In [None]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ Return the value of `column` for the row at position `row_num` of `df`. """
    selected_row = df.iloc[row_num]
    return selected_row[column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ Return the position of the first row of `df` whose `column` equals `tr_id`.

        Raises IndexError when no row matches.
    """
    matching_positions = np.flatnonzero(df[column].values == tr_id)
    return matching_positions[0]

# Read data

In [None]:
# Load the tab-separated input files from the local data/ directory.
train = pd.read_csv('data/train_final.csv', delimiter='\t')
playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
target_tracks = pd.read_csv('data/target_tracks.csv', delimiter = '\t')
tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

In [None]:
# Untouched copies of the train set and of the target playlists.
# We load them just to compare the ones for testing with the original ones
# (train_test_split reads "target_playlists_original" to pick the eligible playlists).
# NB: we shouldn't use them in training!
train_original = pd.read_csv('data/train_final.csv', delimiter='\t')
target_playlists_original = pd.read_csv('data/target_playlists.csv', delimiter='\t')

In [None]:
# Row counts of the inputs before the local train/test split.
len(train), len(target_playlists), len(target_tracks)

In [None]:
# Replace the loaded inputs with a local split: with test_size=1 every eligible playlist
# (at least 13 rows in train) becomes a test playlist.
# NOTE: "train" is rebound here, so re-running this cell splits the already reduced frame.
train, test, target_playlists, target_tracks = train_test_split(train, test_size=1, min_playlist_tracks=13)

In [None]:
# Row counts after the split.
len(train), len(test), len(target_playlists), len(target_tracks)

# Process data

In [None]:
# Remap the original track/playlist ids to positional numbers and build the lookup tables
# used by the rest of the notebook. Almost all of this was taken from one of your notebooks.
# NOTE: the statements below mutate the frames in place and depend on their order,
# so this cell must be run exactly once after loading/splitting the data.

# Keep the original id in "*_tmp" and use the row position as the new id.
tracks['track_id_tmp'] = tracks['track_id']

tracks['track_id'] = tracks.index

playlists['playlist_id_tmp'] = playlists['playlist_id']
playlists['playlist_id'] = playlists.index

train['playlist_id_tmp'] = train['playlist_id']
train['track_id_tmp'] = train['track_id']

# original track id -> track number
track_to_num = pd.Series(tracks.index)
track_to_num.index = tracks['track_id_tmp']

# original playlist id -> playlist number
playlist_to_num = pd.Series(playlists.index)
playlist_to_num.index = playlists['playlist_id_tmp']

# track number -> original track id
num_to_tracks = pd.Series(tracks['track_id_tmp'])

train['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

# The "tags" and "title" columns are parsed from their string form into integer arrays.
# NOTE(review): eval() executes whatever the CSV contains; ast.literal_eval would be the
# safer choice if the files are not fully trusted.
tracks.tags = tracks.tags.apply(lambda s: np.array(eval(s), dtype=int))

playlists.title = playlists.title.apply(lambda s: np.array(eval(s), dtype=int))

target_playlists['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks['track_id_tmp'] = target_tracks['track_id']
target_tracks['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

# Create a dataframe that maps a playlist to the array of its tracks
# (columns: 'playlist_id', 'track_ids'; indexed by playlist number)
playlist_tracks = pd.DataFrame(train['playlist_id'].drop_duplicates())
playlist_tracks.index = train['playlist_id'].unique()
playlist_tracks['track_ids'] = train.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

# Create a dataframe that maps a track to the array of the playlists it appears in
# (columns: 'track_id', 'playlist_ids'; indexed by track number)
track_playlists = pd.DataFrame(train['track_id'].drop_duplicates())
track_playlists.index = train['track_id'].unique()
track_playlists['playlist_ids'] = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

# Bad albums (ill-formed values such as an empty list, -1 or None) are replaced by album 0.
# "bad_albums" counts how many replacements were made.
bad_albums = 0
def transform_album_1(alb):
    """
        Parse the string form of an album list and return its first element,
        or 0 (counting it in the global "bad_albums") when the list is empty or
        starts with None or -1.
        NOTE(review): the string is parsed with eval(), which executes its content.
    """
    global bad_albums
    parsed = eval(alb)
    if len(parsed) == 0 or parsed[0] in (None, -1):
        bad_albums += 1
        return 0
    return parsed[0]

# Replace the textual album list of every track with a single album id.
tracks.album = tracks.album.apply(transform_album_1)

## Recover albums
Choose one of the following:<br>
1 - fill with most similar albums according to the URM<br>
2 - fill with brand new albums 

#### Fill with most similar albums according to the URM

In [None]:
def get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Build the playlists x albums matrix (how many tracks of each album a playlist contains).

        Possible norms are "no", "idf", "okapi", "tf". Default to "no".

        Arguments:
            - tracks: dataframe indexed by track number with an 'album' column
            - playlist_tracks: dataframe with the columns 'playlist_id' and 'track_ids'
            - target_playlists: unused
            - norm: weighting applied to the counts
            - OKAPI_K, OKAPI_B: parameters of the "okapi" and "tf" weightings

        The number of rows is taken from the module-level "playlists" dataframe.

        Returns (UAM_album, UAM_album_no_norm, album_to_val):
            - UAM_album: lil matrix with the (possibly re-weighted) values
            - UAM_album_no_norm: lil matrix with the raw counts
            - album_to_val: dict album -> idf value (empty when norm is "no")
    """

    unique_albums = tracks.album.unique()
    shape = (max(playlists.playlist_id)+1, max(unique_albums)+1)

    UAM_album = lil_matrix(shape)
    UAM_album_no_norm = lil_matrix(shape)
    # album -> playlist id, appended once per (playlist, track) occurrence
    album_to_playlists = {}

    i = 0
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            alb = tracks.loc[tr_id].album
            UAM_album[pl_id,alb] += 1
            UAM_album_no_norm[pl_id,alb] += 1
            album_to_playlists.setdefault(alb, []).append(pl_id)

        i += 1
        if i % 1000 == 0:
            print(i)

    album_to_val = {}
    if norm == "okapi" or norm == "idf" or norm == "tf":
        # Average number of tracks per playlist
        avg_document_length = sum(len(tr_ids) for tr_ids in playlist_tracks.track_ids) / len(playlist_tracks)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            # Snapshot the columns and the document length BEFORE overwriting the row:
            # UAM_album.rows / UAM_album.data are the live lists of the matrix, so reading
            # sum(data) inside the loop would mix raw counts with already re-weighted values.
            albums = list(UAM_album.rows[pl_id])
            doc_length = sum(UAM_album.data[pl_id])
            for album in albums:
                fq = UAM_album[pl_id,album]
                nq = len(album_to_playlists[album])
                # 500 is a hard-coded constant of the idf formula
                idf = math.log(500/(nq + 0.5))

                if album not in album_to_val:
                    album_to_val[album] = idf

                if norm == "idf":
                    UAM_album[pl_id,album] = idf
                elif norm == "okapi":
                    UAM_album[pl_id,album] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * doc_length / avg_document_length))
                elif norm == "tf":
                    UAM_album[pl_id,album] = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * doc_length / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)

    return UAM_album, UAM_album_no_norm, album_to_val

In [None]:
# Build the playlists x albums matrices used to substitute each bad album with the
# most similar album according to playlist frequencies (idf-weighted and raw counts).
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="idf")

In [None]:
# Preview the last rows of "tracks" before adding the corrected album column.
tracks.tail()

In [None]:
# Start from the current album of every track; the correction loop below overwrites
# the entries that are 0.
tracks["album_corrected"] = tracks["album"]

In [None]:
# Preview the last rows of "tracks" with the new "album_corrected" column.
tracks.tail()

In [None]:
def transform_album_sim(tr_id):
    """
        Guess the album of track "tr_id" from the playlists that contain it.

        For every playlist containing the track, log(count + 1) of the playlist's album
        counts (rows of the module-level "UAM_album_no_norm") is accumulated; the album
        with the highest total, excluding the placeholder album 0, is returned.

        Returns 0 when no album other than 0 has a positive score.
    """
    tot = np.zeros(max(tracks.album)+1)
    for pl_id in track_playlists.loc[tr_id].playlist_ids:
        ar = UAM_album_no_norm[pl_id].toarray()[0]
        tot += np.log(ar + 1)
        #tot += ar.clip(max=1)

    # Album 0 is the "unknown album" placeholder and must never be proposed.
    tot[0] = 0
    best = tot.argmax()
    # The best real album is returned whether or not album 0 had the highest raw score;
    # a candidate with a zero score carries no evidence and is rejected.
    if tot[best] > 0:
        return best
    return 0

# Try to recover the album of every track that appears in at least one training playlist
# and whose album is still the placeholder 0. "corrected_albums" counts the recovered ones.
corrected_albums = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if row.album_corrected == 0:
        new_album = transform_album_sim(row.track_id)
        if new_album != 0:
            # .at replaces DataFrame.set_value, which was removed in pandas 1.0
            tracks.at[row.track_id, "album_corrected"] = new_album
            corrected_albums += 1
            if corrected_albums % 100 == 0:
                print(corrected_albums)

In [None]:
# Number of ill-formed albums found while parsing vs. number of albums recovered above.
bad_albums, corrected_albums

In [None]:
# Preview the last rows of "tracks" after the album correction.
tracks.tail()

In [None]:
# Number of tracks whose "album" column is still the placeholder 0
# (the correction above writes to "album_corrected", not to "album").
len(tracks[tracks.album == 0])

#### Fill with brand new albums

In [None]:
# Give every track with the placeholder album 0 its own brand new album id.
def transform_album_2(alb):
    """
        Return "alb" unchanged unless it is 0; in that case return the next unused
        album id and advance the global "next_album_id" counter.
    """
    global next_album_id
    if alb != 0:
        return alb
    fresh_album = next_album_id
    next_album_id += 1
    return fresh_album
# New ids start right after the largest album id currently in use.
last_album = tracks.album.max()
next_album_id = last_album + 1
tracks.album = tracks.album.apply(transform_album_2)

In [None]:
# Number of tracks whose "album" column is still 0 after the substitution above.
len(tracks[tracks.album == 0])

## Recover tags according to URM

In [None]:
# Add a "tags_corrected" column that starts as the current tags.
# NOTE(review): the recovery loop further down writes to "tags", not to "tags_corrected" -
# confirm which column is meant to hold the recovered tags.
tracks["tags_corrected"] = tracks["tags"]

In [None]:
# Map every distinct tag to the list of the tracks that carry it.
tag_tracks = {}
for row in tracks.itertuples():
    for tag in row.tags:
        tag_tracks.setdefault(tag, []).append(row.track_id)

In [None]:
# User Tag Matrix UTM
def get_UTM(tracks, playlist_tracks, tag_tracks, norm="no", OKAPI_K=1.7, OKAPI_B=0.75, best_tag=False):
    """
        Build the playlists x tags matrix (how many times each tag occurs among the
        tracks of a playlist).

        Possible norm are "no", "okapi", "idf", "tf". Default to "no".

        Arguments:
            - tracks: dataframe indexed by track number with a 'tags' column
                (and a 'best_tag' column when best_tag is true)
            - playlist_tracks: dataframe with the columns 'playlist_id' and 'track_ids'
            - tag_tracks: dict tag -> list of the tracks carrying it
            - norm: weighting applied to the counts
            - OKAPI_K, OKAPI_B: parameters of the "okapi" and "tf" weightings
            - best_tag: if true, only the 'best_tag' of every track is counted and the
                module-level "best_tag_tracks" dict is used instead of "tag_tracks"

        The number of rows is taken from the module-level "playlists" dataframe.

        Returns (UTM, UTM_no_norm): the (possibly re-weighted) matrix and the raw counts.
    """

    tag_dict = best_tag_tracks if best_tag else tag_tracks
    unique_tags = list(tag_dict.keys())

    shape = (max(playlists.playlist_id)+1, max(unique_tags)+1)
    UTM = lil_matrix(shape)
    UTM_no_norm = lil_matrix(shape)

    i = 0
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            tr_row = tracks.loc[tr_id]
            if best_tag:
                UTM[pl_id,tr_row.best_tag] += 1
                UTM_no_norm[pl_id,tr_row.best_tag] += 1
            else:
                for tag in tr_row.tags:
                    UTM[pl_id,tag] += 1
                    UTM_no_norm[pl_id,tag] += 1

        i += 1
        if i % 1000 == 0:
            print(i)

    if norm == "okapi" or norm == "idf" or norm == "tf":
        # Average number of tag occurrences per row of the matrix
        avg_document_length = sum(list(map(lambda l: sum(l), UTM.data)))/len(UTM.data)

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            # Snapshot the columns and the document length BEFORE overwriting the row:
            # UTM.rows / UTM.data are the live lists of the matrix, so reading sum(data)
            # inside the loop would mix raw counts with already re-weighted values.
            tags = list(UTM.rows[pl_id])
            doc_length = sum(UTM.data[pl_id])
            for tag in tags:
                fq = UTM[pl_id,tag]
                nq = len(tag_dict[tag])
                # 28000 is a hard-coded constant of the idf formula
                idf = math.log(28000/(nq + 0.5))

                if norm == "idf":
                    UTM[pl_id,tag] = idf
                elif norm == "okapi":
                    UTM[pl_id,tag] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * doc_length / avg_document_length))
                elif norm == "tf":
                    UTM[pl_id,tag] = (fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * doc_length / avg_document_length))

            i += 1
            if i % 1000 == 0:
                print(i)

    return UTM, UTM_no_norm

In [None]:
# Okapi-weighted playlists x tags matrix (all tags of every track) plus the raw counts.
UTM, UTM_no_norm = get_UTM(tracks, playlist_tracks, tag_tracks, norm="okapi", best_tag=False)

In [None]:
def get_tags_sim(tr_id):
    """
        Return the 5 tag ids with the highest total weight in the module-level "UTM"
        over the playlists that contain track "tr_id".
    """
    tag_scores = csr_matrix((1,max(tag_tracks)+1))
    containing_playlists = track_playlists.loc[tr_id].playlist_ids
    for pl_id in containing_playlists:
        tag_scores = tag_scores + UTM[pl_id]
    dense_scores = tag_scores.toarray()[0]
    ranked_tags = dense_scores.argsort()[::-1]
    return ranked_tags[0:5]
    

# Give 5 recovered tags to every track that appears in at least one training playlist
# and has no tags. "corrected_tags" counts the tracks that were filled in.
# NOTE(review): the result is written to "tags" (not "tags_corrected"), and "tag_tracks"
# was built before this loop, so it does not contain the recovered tags - confirm intent.
# NOTE(review): DataFrame.set_value was removed in pandas 1.0.
corrected_tags = 0
for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
    if len(row.tags) == 0:
        new_tags = get_tags_sim(row.track_id)
        tracks.set_value(row.track_id, "tags", new_tags)
        
        corrected_tags += 1
        if corrected_tags % 100 == 0:
            print(corrected_tags)

In [None]:
# Preview the last rows of "tracks" after the tag recovery.
tracks.tail()

# "Training"

## Item-item similarity using only URM

In [None]:
def sigmoid(gamma):
    """
        Logistic function 1/(1 + e^-gamma).
        The exponential is always taken of a non-positive number, so it cannot overflow.
    """
    if gamma >= 0:
        return 1/(1 + math.exp(-gamma))
    return 1 - 1/(1 + math.exp(gamma))

# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no", pow_base=500, pow_exp=0.15):
    """
        Build the playlists x tracks matrix from "track_playlists".

        Possible normalizations: "no", "idf", "sqrt", "pow", "atan". Default "no".
        Every (playlist, track) entry gets a value that only depends on nq, the number
        of playlists the track appears in:
            - "idf":  log(500 / nq)
            - "sqrt": sqrt(500 / nq)
            - "pow":  (pow_base / nq) ^ pow_exp
            - "atan": 3 + atan(-0.1 * nq + 1)
            - anything else: 1

        Returns a lil matrix of shape (len(playlists), len(tracks)).
    """
    def rating(nq):
        # Value stored for a track that belongs to "nq" playlists.
        if norm == "idf":
            return math.log((500)/nq)
        if norm == "sqrt":
            return math.sqrt((500)/nq)
        if norm == "pow":
            return math.pow((pow_base)/nq, pow_exp)
        if norm == "atan":
            return 3 + 1*math.atan(-0.1*nq + 1)
        return 1

    URM = lil_matrix((len(playlists), len(tracks)))
    num_playlists = len(playlist_tracks)  # not used by any weighting

    for counter, row in enumerate(track_playlists.itertuples()):
        containing_playlists = row.playlist_ids
        nq = len(containing_playlists)
        for pl_id in containing_playlists:
            URM[pl_id,row.track_id] = rating(nq)
        if counter % 1000 == 0:
            print(counter)

    return URM

#
# URM:
# 
#              tracks
#            _________
#           \         \
# playlists \         \
#           \_________\
#

In [None]:
# Binary playlists x tracks matrix (norm="no" stores 1 for every playlist/track pair).
URM_normalize = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="no")

In [None]:
# Playlists x tracks matrix where every entry is (500 / nq) ^ 0.15,
# nq being the number of playlists that contain the track.
URM_pow = get_URM(tracks, playlists, playlist_tracks, track_playlists, norm="pow", pow_base=500, pow_exp=0.15)

In [None]:
# Step 2: produce item-item matrix with cosine similarity, keeping the 20 best values per track
row_group = 1000
# All-zero rows (one column per track) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, URM_normalize.shape[1]))
TTM_cosine = dot_with_top(URM_normalize.transpose(), URM_normalize, def_rows_i, top=20, row_group=row_group, similarity="cosine-old")

In [None]:
# Item-item matrix from the plain product of the pow-weighted URM, 20 best values per track
row_group = 1000
# All-zero rows (one column per track) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, URM_pow.shape[1]))
TTM_dot = dot_with_top(URM_pow.transpose(), URM_pow, def_rows_i, top=20, row_group=row_group, similarity="dot-old")

## Item-item similarity starting from a user-user similarity using only the URM

In [None]:
# User-user (playlist-playlist) cosine similarity, 200 best values per playlist
row_group = 1000
# All-zero rows (one column per playlist) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, URM_normalize.transpose().shape[1]))
UUM_cosine = dot_with_top(URM_normalize, URM_normalize.transpose(), def_rows_i, top=200, row_group=row_group, similarity="cosine-old")

In [None]:
# Playlists x tracks matrix obtained by combining the user-user similarity with the URM,
# 500 best values per playlist
row_group = 1000
# The fallback rows must have the same number of columns as the result, i.e. one per
# track (URM_normalize.shape[1]), not one per playlist; otherwise stacking fails as soon
# as a group of rows needs the fallback.
def_rows_i = csr_matrix((row_group, URM_normalize.shape[1]))
URM_UUM_cosine = dot_with_top(UUM_cosine, URM_normalize, def_rows_i, top=500, row_group=row_group, similarity="cosine-old")

In [None]:
# Item-item cosine similarity computed on the user-user based matrix, 20 best values per track
row_group = 1000
# All-zero rows (one column per track) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, URM_UUM_cosine.shape[1]))
TTM_UUM_cosine = dot_with_top(URM_UUM_cosine.transpose(), URM_UUM_cosine, def_rows_i, top=20, row_group=row_group, similarity="cosine-old")

In [None]:
# Disabled code: the whole calibration helper below is wrapped in a string literal,
# so nothing is defined when this cell runs.
"""
# Calibration
def calibrate_predictions(pred, theta=0.5):
    max_r = np.amax(pred, axis=0)
    mean_r = np.mean(pred, axis=0)

    pred_coo = pred.tocoo()
    pred_csr = pred.tocsr()
    max_r_csr = max_r.tocsr()

    counter = 0
    for i,j,v in zip(pred_coo.row, pred_coo.col, pred_coo.data):
        if v >= max_r_csr[0,j]:
            pred_csr[i,j] = 1
        elif v >= mean_r[0,j]:
            pred_csr[i,j] = theta + (1 - theta)*((v - mean_r[0,j])/(max_r_csr[0,j] - mean_r[0,j]))
        else:
            pred_csr[i,j] = theta * v / mean_r[0,j]
        counter += 1
        if counter % 10000 == 0:
            print("{0} out of {1}".format(counter, len(pred.data)))
    
    return pred_csr"""

## Album

<div style="white-space: pre-wrap;">
Steps:
1 - Compute the playlists_x_albums sparse matrix (i.e. the UAM_album matrix, where U stands for User). I do this before computing the tracks_x_albums sparse matrix (i.e. the IAM_album matrix, where I stands for Item) because this step also computes the "album_to_val" dictionary, which contains the IDF value of each album obtained by treating the playlists (and not the tracks) as documents. However, at the moment I don't use it because I compute the IAM_album matrix without any normalization, so you may skip this step...
2 - Compute the tracks_x_albums IAM_album sparse matrix.
3 - Compute the SYM_ALBUM tracks_x_tracks matrix by doing IAM_album.dot(IAM_album.transpose()). It's not big, so I don't need to keep the K best values...
4 - Compute the album_parameter, which measures how strongly each playlist relies on album similarity. I do this by computing the entropy of the numpy array containing the occurrences of the albums in the playlist, and then taking 1/(entropy_of_array + 0.05).
</div>

In [None]:
# Distinct album ids currently stored in "tracks" (displayed as the cell output).
unique_albums = tracks.album.unique()
unique_albums

In [None]:
# Map every album to the list of its tracks.
album_tracks = {}
for row in tracks.itertuples():
    album_tracks.setdefault(row.album, []).append(row.track_id)

In [None]:
def get_IAM_album(tracks, target_tracks, norm="no", most_similar=5):
    """
        Build the tracks x albums matrix.

        Possible norms are "no", "idf", "most-similar".
        Default "no".
            - "no": 1 in the column of the track's album
            - "idf": log(500 / (nq + 0.5)), nq being the number of tracks of the album
                (module-level "album_tracks"), or nothing when the album is not in the
                module-level "album_to_val"
            - "most-similar": for the tracks that appear in the module-level
                "track_playlists", the "most_similar" albums with the highest value in
                the row of the track's album in the module-level "ALB_ALB_SYM" get the
                weights 1, 0.9, 0.8, ...

        Arguments:
            - tracks: dataframe with the columns 'track_id' and 'album'
            - target_tracks: unused
            - norm: see above
            - most_similar: number of albums stored per track by the "most-similar" norm

        Returns a lil matrix of shape (len(tracks), max album id + 1).
    """
    unique_albums = tracks.album.unique()
    IAM_album = lil_matrix((len(tracks), max(unique_albums)+1))

    i = 0

    if norm == "most-similar":
        def get_album_sim(alb, n_best=5):
            # Indices of the "n_best" largest values of the album's similarity row, best first
            bests = []
            a = ALB_ALB_SYM[alb].toarray()[0]
            for k in range(n_best):
                bests.append(a.argpartition(len(a)-1-k)[-1-k])
            return bests

        for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
            # Honour the "most_similar" argument (it used to be ignored in favour of 5)
            bests = get_album_sim(row.album, n_best=most_similar)
            for it,alb in enumerate(bests):
                IAM_album[row.track_id, alb] = 1 - it*0.1
            if i % 100 == 0:
                print(i)
            i += 1

    else:
        for row in tracks.itertuples():
            if norm == "idf":
                nq = len(album_tracks[row.album])
                if row.album in album_to_val:
                    IAM_album[row.track_id,row.album] = math.log(500/(nq + 0.5))
                else:
                    IAM_album[row.track_id,row.album] = 0 # Give zero if the album is not in any playlist!
            else:
                IAM_album[row.track_id,row.album] = 1
            if i % 100 == 0:
                print(i)
            i += 1

    return IAM_album

In [None]:
# Step 2: tracks x albums matrix without any normalization (1 for the album of each track)
IAM_album = get_IAM_album(tracks, target_tracks, norm="no")

In [None]:
# Step 3: tracks x tracks album similarity (non-zero where two tracks share the same album)
SYM_ALBUM = IAM_album.dot(IAM_album.transpose())

## Artist
Same steps as for Album

In [None]:
# Distinct artist ids stored in "tracks".
unique_artists = tracks.artist_id.unique()

In [None]:
# Item Artist Matrix
def get_IAM(tracks, target_tracks, norm="no", n_best=5):
    """
        Build the tracks x artists matrix.

        Possible norms are "no", "idf", "most-similar". Default to "no".
            - "no": 1 in the column of the track's artist
            - "idf": the value of the artist in the module-level "artist_to_val",
                or nothing when the artist is not in it
            - "most-similar": for the tracks that appear in the module-level
                "track_playlists", the "n_best" artists with the highest value in the
                row of the track's artist in the module-level "ART_ART_SYM" get the
                weights 1, 0.9, 0.8, ...

        Arguments:
            - tracks: dataframe with the columns 'track_id' and 'artist_id'
            - target_tracks: unused
            - norm: see above
            - n_best: number of artists stored per track by the "most-similar" norm

        Returns a lil matrix of shape (len(tracks), max artist id + 1).
    """
    unique_artists = tracks.artist_id.unique()
    IAM = lil_matrix((len(tracks), max(unique_artists)+1))

    i = 0

    if norm == "most-similar":
        def get_artist_sim(art, n_best=5):
            # Indices of the "n_best" largest values of the artist's similarity row, best first
            bests = []
            a = ART_ART_SYM[art].toarray()[0]
            for k in range(n_best):
                bests.append(a.argpartition(len(a)-1-k)[-1-k])
            return bests

        for row in tracks[tracks.track_id.isin(track_playlists.track_id)].itertuples():
            # Honour the "n_best" argument (it used to be ignored in favour of 5)
            bests = get_artist_sim(row.artist_id, n_best=n_best)
            for it,art in enumerate(bests):
                IAM[row.track_id, art] = 1 - it*0.1
            if i % 100 == 0:
                print(i)
            i += 1
    else:
        for row in tracks.itertuples():
            if norm == "idf":
                if row.artist_id in artist_to_val:
                    IAM[row.track_id,row.artist_id] = artist_to_val[row.artist_id]
                else:
                    IAM[row.track_id,row.artist_id] = 0 # Give zero if the artist is not in any playlist!
            else:
                IAM[row.track_id,row.artist_id] = 1

            if i % 1000 == 0:
                print(i)
            i += 1

    return IAM

In [None]:
# Step 2: tracks x artists matrix without any normalization (1 for the artist of each track)
IAM = get_IAM(tracks, target_tracks, norm="no")

In [None]:
# Step 3: tracks x tracks artist similarity (non-zero where two tracks share the same artist)
SYM_ARTIST = IAM.dot(IAM.transpose())

# Tags

In [None]:
# Item Tag Matrix ITM
def get_ITM(tracks, tag_tracks, norm="no", best_tag=False):
    """
        Build the tracks x tags matrix from a dict tag -> list of tracks.

        Possible norm are "no", "sqrt", "okapi". Default to "no".
        With nq the number of tracks of a tag and n the number of tracks:
            - "okapi": log((n - nq + 0.5) / (nq + 0.5))
            - "sqrt":  sqrt((n - nq + 0.5) / (nq + 0.5))
            - anything else: 1

        If best_tag is true the module-level "best_tag_tracks" dict is used instead
        of "tag_tracks".

        Returns a lil matrix of shape (len(tracks), max tag id + 1).
    """
    tag_dict = best_tag_tracks if best_tag else tag_tracks
    unique_tags = list(tag_dict.keys())

    num_tracks = len(tracks)
    ITM = lil_matrix((num_tracks, max(unique_tags)+1))

    for counter, (tag, track_ids) in enumerate(tag_dict.items()):
        nq = len(track_ids)
        # The weight only depends on the tag, so it is computed once per tag.
        if norm == "okapi":
            weight = math.log((num_tracks - nq + 0.5)/(nq + 0.5))
        elif norm == "sqrt":
            weight = math.sqrt((num_tracks - nq + 0.5)/(nq + 0.5))
        else:
            weight = 1
        for track_id in track_ids:
            ITM[track_id,tag] = weight
        if counter % 1000 == 0:
            print(counter)

    return ITM

In [None]:
# Binary tracks x tags matrix built from "tag_tracks".
ITM = get_ITM(tracks, tag_tracks, norm="no", best_tag=False)

In [None]:
# Step 2: produce item-item matrix with cosine similarity on the tags, 25 best values per track
row_group = 1000
# All-zero rows (one column per track) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, ITM.shape[0]))
SYM_TAG = dot_with_top(ITM, ITM.transpose(), def_rows_i, top=25, row_group=row_group, similarity="cosine-old")

# Other similarities...

In [None]:
# Step 1: playlists x albums matrix with raw counts.
# NOTE: this rebinds UAM_album, UAM_album_no_norm and album_to_val computed earlier with norm="idf".
UAM_album, UAM_album_no_norm, album_to_val = get_UAM_album(tracks, playlist_tracks, target_playlists, norm="no")

In [None]:
# Step 2: produce the tracks x playlists matrix (cosine similarity between the album
# profile of a track and the album profile of a playlist), 10 best values per track
row_group = 1000
# All-zero rows (one column per playlist) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, UAM_album.shape[0]))
TR_PL_ALBUM = dot_with_top(IAM_album, UAM_album.transpose(), def_rows_i, top=10, row_group=row_group, similarity="cosine-old")

In [None]:
# Step 3: produce item-item matrix with cosine similarity on the tracks x playlists
# album matrix, 10 best values per track
row_group = 1000
# All-zero rows (one column per track) used to fill the rows that would otherwise be missing
def_rows_i = csr_matrix((row_group, TR_PL_ALBUM.shape[0]))
SYM_ALBUM_COMPLEX = dot_with_top(TR_PL_ALBUM, TR_PL_ALBUM.transpose(), def_rows_i, top=10, row_group=row_group, similarity="cosine-old")

In [None]:
# User Artist Matrix UAM
def get_UAM(tracks, playlist_tracks, target_playlists, norm="no", OKAPI_K=1.7, OKAPI_B=0.75):
    """
        Build the playlist x artist matrix (UAM).

        Entry (playlist_id, artist_id) of the raw matrix is the number of tracks
        by that artist listed for the playlist in "playlist_tracks".

        Possible norms are "no", "idf", "okapi". Default to "no".
          - "no":    UAM holds the raw counts.
          - "idf":   every non-zero entry is replaced by the artist's IDF,
                     log((N - nq + 0.5) / (nq + 0.5)), where N is the number of
                     rows of "playlist_tracks" and nq the number of distinct
                     playlists containing the artist.
          - "okapi": every non-zero entry is replaced by
                     idf * fq * (K + 1) / (fq + K * (1 - B + B * dl / avgdl)),
                     with fq the raw count, dl the sum of the raw counts of the
                     playlist and avgdl the mean length of "track_ids".

        NOTE: the number of rows is read from the notebook-level "playlists"
        dataframe (max(playlists.playlist_id) + 1); it is not a parameter.

        Parameters:
            tracks           dataframe indexed by track id, with an "artist_id" column.
            playlist_tracks  dataframe with "playlist_id" and "track_ids" columns.
            target_playlists unused; kept so existing calls keep working.
            norm             "no", "idf" or "okapi".
            OKAPI_K, OKAPI_B Okapi BM25 constants (only used when norm == "okapi").

        Returns:
            (UAM, UAM_no_norm, artist_to_val): two lil_matrix objects (weighted
            and raw counts) and a dict artist_id -> idf, which stays empty when
            norm == "no".
    """

    unique_artists = tracks.artist_id.unique()
    shape = (max(playlists.playlist_id)+1, max(unique_artists)+1)

    UAM = lil_matrix(shape)
    UAM_no_norm = lil_matrix(shape)
    # artist -> set of playlists containing it. A set (not a list) so that an
    # artist appearing several times in one playlist is counted once: nq is a
    # document frequency and must never exceed N, otherwise the IDF logarithm
    # would receive a negative argument.
    artist_to_playlists = {}
    # Column lookup hoisted out of the loop; avoids building a full row per track.
    artist_of_track = tracks.artist_id

    i = 0
    for row in playlist_tracks.itertuples():
        pl_id = row.playlist_id
        for tr_id in row.track_ids:
            art = artist_of_track.at[tr_id]
            UAM[pl_id,art] += 1
            UAM_no_norm[pl_id,art] += 1
            artist_to_playlists.setdefault(art, set()).add(pl_id)

        i += 1
        if i % 1000 == 0:
            print(i)

    artist_to_val = {}
    if norm == "okapi" or norm == "idf":
        N = len(playlist_tracks)
        total_length = sum(len(tr_ids) for tr_ids in playlist_tracks.track_ids)
        avg_document_length = total_length / N if N else 0.0

        i = 0

        for row in playlist_tracks.itertuples():
            pl_id = row.playlist_id
            # Raw counts are read from UAM_no_norm, which is never overwritten.
            # Reading them from UAM (as before) mixed raw counts with weights
            # already written for earlier artists of the same playlist, and
            # iterated a row list that assignment can shrink.
            artists = UAM_no_norm.rows[pl_id]
            counts = UAM_no_norm.data[pl_id]
            document_length = sum(counts)
            for artist, fq in zip(artists, counts):
                nq = len(artist_to_playlists[artist])
                idf = math.log((N - nq + 0.5)/(nq + 0.5))

                if artist not in artist_to_val:
                    artist_to_val[artist] = idf

                if norm == "idf":
                    UAM[pl_id,artist] = idf
                else:
                    UAM[pl_id,artist] = idf*(fq*(OKAPI_K+1))/(fq + OKAPI_K*(1 - OKAPI_B + OKAPI_B * document_length / avg_document_length))
            i += 1
            if i % 1000 == 0:
                print(i)

    return UAM, UAM_no_norm, artist_to_val

In [None]:
# Step 1: playlist-artist matrix, raw counts (norm="no").
UAM, UAM_no_norm, artist_to_val = get_UAM(
    tracks,
    playlist_tracks,
    target_playlists,
    norm="no",
)

In [None]:
# Step 2: track-playlist matrix through artists (cosine similarity).
row_group = 1000
# Empty default block handed to dot_with_top; per the original note it is
# needed to fill some rows that would otherwise be all zeros. An earlier
# variant seeded it with IAM[0:row_group].dot(UAM.transpose()).
def_rows_i = csr_matrix((row_group, UAM.shape[0]))
TR_PL_ARTIST = dot_with_top(
    IAM,
    UAM.T,
    def_rows_i,
    top=10,
    row_group=row_group,
    similarity="cosine",
)

In [None]:
# Step 2: item-item matrix obtained by comparing the rows of TR_PL_ARTIST
# with each other (cosine similarity).
row_group = 1000
# Empty default block handed to dot_with_top; per the original note it is
# needed to fill some rows that would otherwise be all zeros. An earlier
# variant seeded it with TR_PL_ARTIST[0:row_group].dot(TR_PL_ARTIST.transpose()).
def_rows_i = csr_matrix((row_group, TR_PL_ARTIST.shape[0]))
SYM_ARTIST_COMPLEX = dot_with_top(
    TR_PL_ARTIST,
    TR_PL_ARTIST.T,
    def_rows_i,
    top=10,
    row_group=row_group,
    similarity="cosine",
)

# Prepare matrices for schwiftyness!
Yoooo we're gonna get schwiftyyyyy: write out all the information that `get_schwifty.cpp` needs in order to work...

In [None]:
def print_similarity(matrix_to_print, directory, filename):
    """
        Write a sparse matrix to "<directory>/<filename>.txt" in COO form.

        File layout (every value is followed by a single space):
            line 1: "<n_rows> <n_cols>"
            line 2: row indices of the stored entries
            line 3: column indices of the stored entries
            line 4: values of the stored entries

        Parameters:
            matrix_to_print  any object exposing tocoo() (scipy sparse matrix).
            directory        existing directory to write into.
            filename         file name without the ".txt" extension.
    """
    matrix_to_print = matrix_to_print.tocoo()

    # "with" closes the handle even when one of the writes raises.
    with open(directory + "/" + filename + ".txt", "w") as file:
        file.write("{0} {1}\n".format(matrix_to_print.shape[0], matrix_to_print.shape[1]))

        for values in (matrix_to_print.row, matrix_to_print.col, matrix_to_print.data):
            # Same "{0} " formatting as before, so the output stays byte-identical.
            file.write("".join("{0} ".format(value) for value in values))
            file.write("\n")

In [None]:
def print_URM(urm, directory, filename):
    """
        Write the URM to "<directory>/<filename>.txt" in COO form, followed by
        the target playlist ids.

        File layout (every value is followed by a single space):
            line 1: "<n_rows> <n_cols>"
            line 2: row indices of the stored entries
            line 3: column indices of the stored entries
            line 4: values of the stored entries
            line 5: playlist ids taken from the notebook-level
                    "target_playlists" dataframe (not a parameter)

        Parameters:
            urm        any object exposing tocoo() (scipy sparse matrix).
            directory  existing directory to write into.
            filename   file name without the ".txt" extension.
    """
    urm_coo = urm.tocoo()

    # "with" closes the handle even when one of the writes raises.
    with open(directory + "/" + filename + ".txt", "w") as file:
        file.write("{0} {1}\n".format(urm_coo.shape[0], urm_coo.shape[1]))

        # row, col and data of a COO matrix always have the same length.
        for values in (urm_coo.row, urm_coo.col, urm_coo.data):
            file.write("".join("{0} ".format(value) for value in values))
            file.write("\n")

        file.write("".join("{0} ".format(pl_id) for pl_id in target_playlists.playlist_id))
        file.write("\n")

In [None]:
def print_test(urm, directory, filename):
    """
        Write the positions of the stored entries of a sparse matrix to
        "<directory>/<filename>.txt" (the values themselves are not written).

        File layout (every value is followed by a single space):
            line 1: "<n_rows> <n_cols>"
            line 2: row indices of the stored entries
            line 3: column indices of the stored entries

        Parameters:
            urm        any object exposing tocoo() (scipy sparse matrix).
            directory  existing directory to write into.
            filename   file name without the ".txt" extension.
    """
    urm_coo = urm.tocoo()

    # "with" closes the handle even when one of the writes raises.
    with open(directory + "/" + filename + ".txt", "w") as file:
        file.write("{0} {1}\n".format(urm_coo.shape[0], urm_coo.shape[1]))

        for indices in (urm_coo.row, urm_coo.col):
            file.write("".join("{0} ".format(index) for index in indices))
            file.write("\n")

In [None]:
def print_target_playlists(target_playlists, directory, filename):
    """
        Write the target playlist ids to "<directory>/<filename>.txt".

        File layout:
            line 1: number of rows of "target_playlists"
            line 2: the values of its "playlist_id" column, each followed by a space

        Parameters:
            target_playlists  dataframe with a "playlist_id" column.
            directory         existing directory to write into.
            filename          file name without the ".txt" extension.
    """
    # "with" closes the handle even when one of the writes raises.
    with open(directory + "/" + filename + ".txt", "w") as file:
        file.write("{0}\n".format(len(target_playlists)))
        file.write("".join("{0} ".format(pl_id) for pl_id in target_playlists.playlist_id))
        file.write("\n")

In [None]:
def print_target_tracks(target_tracks, directory, filename):
    """
        Write the target track ids to "<directory>/<filename>.txt".

        File layout:
            line 1: number of rows of "target_tracks"
            line 2: the values of its "track_id" column, each followed by a space

        Parameters:
            target_tracks  dataframe with a "track_id" column.
            directory      existing directory to write into.
            filename       file name without the ".txt" extension.
    """
    # "with" closes the handle even when one of the writes raises.
    with open(directory + "/" + filename + ".txt", "w") as file:
        file.write("{0}\n".format(len(target_tracks)))
        file.write("".join("{0} ".format(tr_id) for tr_id in target_tracks.track_id))
        file.write("\n")

In [None]:
# Directory that receives every print_* export of this section; each helper
# writes to "<schwifty_directory>/<filename>.txt" and does not create the directory.
schwifty_directory = "test1"

In [None]:
# Export URM_pow (COO triplets followed by the target playlist ids) to
# "<schwifty_directory>/tracks_in_playlist.txt".
print_URM(URM_pow, schwifty_directory, "tracks_in_playlist")

In [None]:
"""test['playlist_id_tmp'] = test['playlist_id']
test['track_id_tmp'] = test['track_id']
test['track_id'] = test['track_id'].apply(lambda x : track_to_num[x])
test['playlist_id'] = test['playlist_id'].apply(lambda x : playlist_to_num[x])"""

In [None]:
"""
# Create a dataframe that maps a playlist to the set of its tracks
playlist_tracks_test = pd.DataFrame(test['playlist_id'].drop_duplicates())
playlist_tracks_test.index = test['playlist_id'].unique()
playlist_tracks_test['track_ids'] = test.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks_test = playlist_tracks_test.sort_values('playlist_id')

# Create a dataframe that maps a track to the set of the playlists it appears into
track_playlists_test = pd.DataFrame(test['track_id'].drop_duplicates())
track_playlists_test.index = test['track_id'].unique()
track_playlists_test['playlist_ids'] = test.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists_test = track_playlists_test.sort_values('track_id')

URM_test = get_URM(tracks, playlists, playlist_tracks_test, track_playlists_test, norm="no")"""

In [None]:
#print_test(URM_test, schwifty_directory, "test")

In [None]:
#print_target_tracks(target_tracks, schwifty_directory, "target_tracks")

In [None]:
#print_target_playlists(target_playlists, schwifty_directory, "target_playlists")

In [None]:
# NOTE(review): TTM_cosine.data is overwritten in place, so running this cell
# a second time applies the 0.75 exponent again. Rebuild TTM_cosine before
# re-running. The rescaled matrix is then exported as similarity_0.
TTM_cosine.data = np.power(TTM_cosine.data, 0.75)
print_similarity(TTM_cosine, schwifty_directory, "similarity_0")

In [None]:
# NOTE(review): TTM_dot.data is overwritten in place, so running this cell a
# second time applies the "power 0.18, minus 1" transform again. Rebuild
# TTM_dot before re-running. The rescaled matrix is then exported as similarity_1.
TTM_dot.data = (np.power(TTM_dot.data, 0.18) - 1)
print_similarity(TTM_dot, schwifty_directory, "similarity_1")

In [None]:
# Export TTM_UUM_cosine as-is to "<schwifty_directory>/similarity_2.txt".
print_similarity(TTM_UUM_cosine, schwifty_directory, "similarity_2")

In [None]:
# Export SYM_ALBUM as-is to "<schwifty_directory>/similarity_3.txt".
print_similarity(SYM_ALBUM, schwifty_directory, "similarity_3")

In [None]:
# Export SYM_ARTIST as-is to "<schwifty_directory>/similarity_4.txt".
print_similarity(SYM_ARTIST, schwifty_directory, "similarity_4")

In [None]:
from pandas import Series

def load_playlist_params(location, params_bitmask):
    """
        Load the per-playlist parameters stored in "<location>/playlist_params.txt".

        The file holds one line of whitespace-separated floats per enabled
        parameter, in order. "params_bitmask" is a string with one character
        per parameter: "1" consumes the next line of the file, any other
        character fills the column with 0.

        NOTE: the playlist ids (and therefore the row order) come from the
        notebook-level "playlists" dataframe; it is not a parameter.

        Parameters:
            location        directory containing "playlist_params.txt".
            params_bitmask  e.g. "11111" or "10100".

        Returns:
            dataframe with the "playlist_id" column plus one "param_<i>"
            column per character of the bitmask.

        Raises:
            IndexError  if the file has fewer lines than the bitmask has "1"s.
            ValueError  if a line is not made of floats or its length differs
                        from the number of playlists.
    """
    # Local import: "os" is not among the notebook's visible top-level imports.
    import os

    with open(os.path.join(location, 'playlist_params.txt'), 'r') as f:
        content = f.readlines()

    playlist_params = pd.DataFrame(playlists.playlist_id)
    p = 0
    for it,ch in enumerate(params_bitmask):
        param_name = "param_" + str(it)
        if ch == "1":
            # split() with no argument tolerates repeated or trailing
            # whitespace; split(' ') produced empty tokens that float() rejects.
            p_list = [float(token) for token in content[p].split()]
            playlist_params[param_name] = pd.Series(data=p_list, index=playlist_params.index)
            p += 1
        else:
            playlist_params[param_name] = 0

    return playlist_params

In [None]:
from subprocess import call

In [None]:
# Run the external ./get_schwifty binary on the directory the print_* cells
# exported to (schwifty_directory instead of a second hard-coded "test1", so
# the two cannot drift apart). The remaining arguments are passed through
# unchanged. The cell output is the process exit status.
call(["./get_schwifty", schwifty_directory, "11111", "adadelta", "500", "0.9", "0.25", "100"])

In [None]:
# Read the parameters back from the directory the exports were written to
# (schwifty_directory instead of a second hard-coded "test1"), enabling all
# five parameters, then display the rows of the target playlists.
playlist_params = load_playlist_params(schwifty_directory, "11111")
playlist_params[playlist_params.playlist_id.isin(target_playlists.playlist_id)]

In [None]:
# Same matrices, in the same order, as the exported similarity_0 .. similarity_4 files.
similarities = [
    TTM_cosine,
    TTM_dot,
    TTM_UUM_cosine,
    SYM_ALBUM,
    SYM_ARTIST,
]

# compute_MAP is off, and the training frame is what gets passed as `test`.
predictions = make_predictions(
    test=train,
    target_playlists=target_playlists,
    urm=URM_pow,
    similarities=similarities,
    playlist_params=playlist_params,
    compute_MAP=False,
    row_group=1000,
    verbose=False,
)

In [None]:
# Snapshot of the predictions frame taken before its columns are rewritten
# (DataFrame.copy() is a deep copy by default).
pr_copy = predictions.copy()

In [None]:
# Overwrite playlist_id with the values kept in playlist_id_tmp.
# presumably these are the original (un-mapped) playlist ids; verify against make_predictions.
predictions['playlist_id'] = predictions['playlist_id_tmp']

In [None]:
# Remove the two helper columns that must not end up in the output file.
predictions = predictions.drop(columns=["playlist_id_tmp", "track_ids_not_mapped"])

In [None]:
# Quick visual check of the first rows before writing the file.
predictions.head()

In [None]:
# Make the dataframe friendly for output -> convert each np.array into a
# space-separated string.
# The conversion is done on a new frame (assign) instead of overwriting
# predictions['track_ids']: re-running the old cell joined the characters of
# the already converted strings and silently corrupted results.csv.
submission = predictions.assign(
    track_ids=predictions['track_ids'].apply(lambda x : ' '.join(map(str, x)))
)
submission.to_csv('results.csv', index=False)