In [70]:
import json
import sys
import time
import codecs
import logging
import pandas as pd
import numpy as np
import keras
import scipy.sparse as sp
import random, os, datetime, math
from keras.models import load_model
from sklearn.cluster import KMeans
from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Multiply, Reshape, Flatten, Concatenate
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.regularizers import l2

In [71]:
def read_data(num_files=10, data_dir='data', slice_size=1000):
    """Read MPD slices and the challenge set.

    Parameters
    ----------
    num_files : int, default 10
        Number of consecutive ``mpd.slice.<start>-<end>.json`` files to read,
        starting with the slice whose first pid is 0.
    data_dir : str, default 'data'
        Directory holding the slice files and ``challenge_set.json``.
    slice_size : int, default 1000
        Number of playlists per slice file; used only to build file names.

    Returns
    -------
    songPlaylist : pandas.DataFrame
        One row per (playlist, track) occurrence with columns ``trackid``,
        ``artist_name``, ``track_name``, ``pid`` plus the integer category
        codes ``trackindex`` (per track URI) and ``cat_pid`` (per playlist).
    challenge_df : pandas.DataFrame
        The challenge playlists with an extra ``pidcat`` category-code column.
    mat : scipy.sparse.dok_matrix
        ``(playlists, tracks)`` float32 matrix holding 1.0 wherever a playlist
        contains a track.
    """
    songPlaylistArray = []
    for file_no in range(num_files):
        first_pid = file_no * slice_size
        file_name = 'mpd.slice.%d-%d.json' % (first_pid, first_pid + slice_size - 1)
        # Context manager so the handle is closed as soon as the slice is parsed.
        with open(os.path.join(data_dir, file_name), 'r') as slice_file:
            mpd_slice = json.load(slice_file)
        # Converting playlist level to track level: walk the parsed dicts
        # directly instead of building a DataFrame only to iterrows() over it.
        for playlist in mpd_slice['playlists']:
            pid = playlist['pid']
            for track in playlist['tracks']:
                songPlaylistArray.append(
                    [track['track_uri'], track['artist_name'], track['track_name'], pid])

    songPlaylist = pd.DataFrame(
        songPlaylistArray, columns=['trackid', 'artist_name', 'track_name', 'pid'])
    songPlaylist['trackindex'] = songPlaylist['trackid'].astype('category').cat.codes
    songPlaylist['cat_pid'] = songPlaylist['pid'].astype('category').cat.codes

    with open(os.path.join(data_dir, 'challenge_set.json')) as challenge_file:
        challenge = json.load(challenge_file)
    challenge_df = pd.DataFrame.from_dict(challenge['playlists'], orient='columns')
    challenge_df['pidcat'] = challenge_df['pid'].astype('category').cat.codes

    m = songPlaylist['cat_pid'].nunique()
    n = songPlaylist['trackindex'].nunique()
    mat = sp.dok_matrix((m, n), dtype=np.float32)
    # A track repeated inside one playlist maps to the same cell, so only the
    # distinct (playlist, track) pairs need to be written.
    pairs = songPlaylist[['cat_pid', 'trackindex']].drop_duplicates()
    for row, col in zip(pairs['cat_pid'].tolist(), pairs['trackindex'].tolist()):
        mat[row, col] = 1.0

    return songPlaylist, challenge_df, mat




In [40]:
# Build the track-level training frame, the challenge-set frame and the
# playlist x track interaction matrix used by the cells below.
songPlaylist, challenge_df, mat = read_data()

In [41]:
# Check whether every playlist code of the challenge set also occurs among the
# playlist codes of the training set.
# NOTE(review): both sides are category codes computed independently per frame
# (`pidcat` on challenge_df, `cat_pid` on songPlaylist), not raw `pid` values,
# so this does not show that the same playlists are present in both sets.
c, s = set(challenge_df['pidcat']), set(songPlaylist['cat_pid'])

c <= s

True

In [47]:
# Number of distinct playlist codes in the training frame.
songPlaylist['cat_pid'].unique().shape

(40000,)

In [48]:
# Number of distinct track codes in the training frame.
songPlaylist['trackindex'].unique().shape

(400817,)

In [50]:
# full NCF model
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=0):
    """Assemble the two-branch NCF network (element-wise product + MLP).

    Parameters
    ----------
    num_users, num_items : int
        Vocabulary sizes of the user and item embedding tables.
    latent_dim : int
        Width of the embeddings used by the element-wise-product branch.
    dense_layers : list of int
        ``dense_layers[0]`` is the width of the concatenated MLP input (each
        MLP embedding gets half of it); the remaining entries are the widths
        of the ReLU dense layers.
    reg_layers : list
        L2 factors: entry 0 regularises the MLP embeddings, entry ``i`` is the
        activity regulariser of dense layer ``i``.
    reg_mf : number
        L2 factor for the product-branch embeddings.

    Returns
    -------
    keras Model taking ``[user_input, item_input]`` (int32, shape ``(1,)``
    each) and returning one sigmoid score. The model is not compiled.
    """
    def build_embedding(layer_name, table_size, width, penalty):
        # All four tables share everything except name, size, width and penalty.
        return Embedding(input_dim=table_size, output_dim=width,
                         name=layer_name,
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(penalty), input_length=1)

    # inputs: one integer id per sample for the user and for the item
    user_ids = Input(shape=(1,), dtype='int32', name='user_input')
    item_ids = Input(shape=(1,), dtype='int32', name='item_input')

    # embedding tables (created in the same order as before so that
    # auto-generated layer names stay the same)
    mlp_width = int(dense_layers[0]/2)
    product_user_table = build_embedding('mf_user_embedding', num_users, latent_dim, reg_mf)
    product_item_table = build_embedding('mf_item_embedding', num_items, latent_dim, reg_mf)
    mlp_user_table = build_embedding('mlp_user_embedding', num_users, mlp_width, reg_layers[0])
    mlp_item_table = build_embedding('mlp_item_embedding', num_items, mlp_width, reg_layers[0])

    # product branch: element-wise product of the two latent vectors
    product_user_vec = Flatten()(product_user_table(user_ids))
    product_item_vec = Flatten()(product_item_table(item_ids))
    product_branch = Multiply()([product_user_vec, product_item_vec])

    # MLP branch: concatenated latent vectors pushed through the dense stack
    mlp_user_vec = Flatten()(mlp_user_table(user_ids))
    mlp_item_vec = Flatten()(mlp_item_table(item_ids))
    hidden = Concatenate()([mlp_user_vec, mlp_item_vec])

    for depth in range(1, len(dense_layers)):
        hidden = Dense(dense_layers[depth],
                       activity_regularizer=l2(reg_layers[depth]),
                       activation='relu',
                       name='layer%d' % depth)(hidden)

    # fuse both branches and squash to a single probability
    fused = Concatenate()([product_branch, hidden])
    score = Dense(1, activation='sigmoid',
                  kernel_initializer='lecun_uniform', name='result')(fused)

    return Model(inputs=[user_ids, item_ids], outputs=score)

# get the training samples
def get_train_samples(train_mat, num_negatives):
    """Turn an interaction matrix into (user, item, label) training triples.

    Every stored entry of ``train_mat`` yields one positive sample (label 1)
    followed by ``num_negatives`` negative samples (label 0) for the same user.
    Negative items are drawn uniformly with ``np.random.randint`` and redrawn
    until the (user, item) pair is not a stored entry.

    Parameters
    ----------
    train_mat : scipy sparse matrix
        ``(num_users, num_items)`` interactions in any scipy.sparse format
        (DOK, CSR, COO, ...); it is read through ``tocoo()``.
    num_negatives : int
        Negative samples generated per positive sample.

    Returns
    -------
    (user_input, item_input, labels) : three equally long Python lists.

    Raises
    ------
    ValueError
        If negatives are requested but some user has interacted with every
        item, in which case rejection sampling could never terminate.
    """
    num_user, num_item = train_mat.shape

    # Go through COO so the function is not tied to the dict-like DOK API;
    # .tolist() yields plain Python ints, as the samples were before.
    coo = train_mat.tocoo()
    rows = coo.row.tolist()
    cols = coo.col.tolist()
    positives = set(zip(rows, cols))  # O(1) membership test while sampling

    if num_negatives > 0:
        positives_per_user = {}
        for u, _ in positives:
            positives_per_user[u] = positives_per_user.get(u, 0) + 1
        if any(count >= num_item for count in positives_per_user.values()):
            raise ValueError(
                'cannot draw negative samples: at least one user has '
                'interacted with all %d items' % num_item)

    user_input, item_input, labels = [], [], []
    for u, i in zip(rows, cols):
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for _ in range(num_negatives):
            j = np.random.randint(num_item)
            while (u, j) in positives:
                j = np.random.randint(num_item)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

# hyperparameters
loaded = True   # True: reuse the in-memory `mat`; False: load the matrix saved on disk
verbose = 1
epochs = 3   # small is due to time constraints
batch_size = 256
latent_dim = 8
dense_layers = [64, 32, 16, 8]   # [MLP input width, then the dense layer widths]
reg_layers = [0, 0, 0, 0]        # L2 factor per entry of dense_layers
reg_mf = 0   # scalar L2 factor (was the list [0]); get_model hands it straight to l2()
num_negatives = 4
learning_rate = 0.001
learner = 'adam'
dataset = 'spotify'

# loading data
if loaded:
    train_mat = mat
else:
    # load_npz returns the matrix in the format it was saved in, which is never
    # DOK; convert so code that relies on the dict-like `.keys()` keeps working.
    train_mat = sp.load_npz('spotify_train_matrix.npz').todok()

num_users, num_items = train_mat.shape
print('Done loading data!')

Done loading data!


In [51]:
# Column count of `train_mat`, i.e. the size of the item vocabulary passed to get_model.
num_items

400817

In [55]:
%%time
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=2)
# get model
model = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf)
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
    
# train model
# generate training instances
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)

# training
hist = model.fit([np.array(user_input), np.array(item_input)], np.array(labels), 
                 batch_size=batch_size, epochs=epochs, verbose=verbose, shuffle=True, callbacks=[es])



Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
mlp_user_embedding (Embedding)  (None, 1, 32)        1280000     user_input[0][0]                 
__________________________________________________________________________________________________
mlp_item_embedding (Embedding)  (None, 1, 32)        12848960    item_input[0][0]                 
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3




Epoch 3/3
Wall time: 49min 36s


In [5]:
# Persist the trained network as HDF5. The file name encodes the dataset, the
# latent dimension and the dense-layer layout.
model_file = '%s_NCF_%d_%s.h5' % (dataset, latent_dim, dense_layers)
save_model(model, model_file, overwrite=True)

In [130]:
# Flatten the challenge set to one row per (seed track, playlist).
# The file is opened in a context manager so the handle is closed after parsing.
with open('data/challenge_set.json') as challenge_file:
    q = json.load(challenge_file)
thisSlice = pd.DataFrame.from_dict(q['playlists'], orient='columns')
# Walk the parsed dicts directly; playlists with an empty `tracks` list
# contribute no rows.
songPlaylistArray1 = [
    [track['track_uri'], playlist['pid']]
    for playlist in q['playlists']
    for track in playlist['tracks']
]
songPlaylist1 = pd.DataFrame(songPlaylistArray1, columns=['trackid', 'pid'])

In [137]:
# Integer category code per challenge pid, computed over the pids present in songPlaylist1.
# NOTE(review): songPlaylist1 only has rows for playlists with at least one track, so these
# codes need not equal challenge_df['pidcat'] if some challenge playlist has no tracks - confirm.
songPlaylist1['pidcat'] = songPlaylist1['pid'].astype('category').cat.codes

In [183]:
# Preview the first rows of the challenge seed-track table.
songPlaylist1.head(20)

Unnamed: 0,trackid,pid,pidcat
0,spotify:track:66U0ASk1VHZsqIkpMjKX3B,1000000,0
1,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,1000000,0
2,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,1000000,0
3,spotify:track:35kahykNu00FPysz3C2euR,1000000,0
4,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,1000000,0
5,spotify:track:6WQLkih8nE0JdUCEyLaGnQ,1000016,3
6,spotify:track:37sINbJZcFdHFAsVNsPq1i,1000016,3
7,spotify:track:0yhPEz5KxlDwckGJaMlZqM,1000016,3
8,spotify:track:5j9iuo3tMmQIfnEEQOOjxh,1000016,3
9,spotify:track:4eLSCSELtKxZwXnFbNLXT5,1000016,3


In [200]:
# generate recommendations for each pid and creating submission file
def output(output_filename, model_path='spotify_NCF_8_[64, 32, 16, 8].h5',
           n_recs=500, n_clusters=100):
    """Rank candidate tracks for every challenge playlist and write the submission file.

    For each row of the global ``challenge_df`` the playlist's ``pidcat`` code
    is used as a row index into the model's ``mlp_user_embedding`` table.
    Users are grouped with KMeans on that table; the candidate tracks are all
    tracks of training playlists (global ``songPlaylist``) that fall into the
    same cluster. Candidates are scored with the model, sorted by descending
    probability, stripped of the playlist's own seed tracks (global
    ``songPlaylist1``) and the best ``n_recs`` are written.

    NOTE(review): using a challenge playlist's category code as an index into
    an embedding table trained on ``songPlaylist`` assumes both code spaces
    refer to the same playlists - confirm this is intended.

    Parameters
    ----------
    output_filename : str
        Path of the file to write.
    model_path : str
        Saved Keras model to load.
    n_recs : int
        Maximum number of tracks written per playlist; fewer are written when
        the cluster offers fewer candidates.
    n_clusters : int
        Number of KMeans clusters.

    Side effects: (re)assigns ``challenge_df['pidcat']``, prints progress and
    writes ``output_filename``.
    """
    print('using model: %s' % model_path)
    model = load_model(model_path)
    print('Loaded model!')

    user_latent_matrix = model.get_layer('mlp_user_embedding').get_weights()[0]

    challenge_df['pidcat'] = challenge_df['pid'].astype('category').cat.codes

    # The clustering depends only on the embedding table, so fit it once
    # instead of once per challenge playlist.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, verbose=0).fit(user_latent_matrix)

    # Candidate tracks per cluster. Rows of the embedding table are addressed
    # by `cat_pid` (the index the model was trained with), not by raw `pid`.
    row_cluster = kmeans.labels_[songPlaylist['cat_pid'].values]
    candidates_by_cluster = {}
    for label, track_codes in songPlaylist.groupby(row_cluster)['trackindex']:
        candidates_by_cluster[int(label)] = track_codes.unique()

    # trackindex -> track URI, built once.
    trackid_by_index = dict(zip(songPlaylist['trackindex'].tolist(),
                                songPlaylist['trackid'].tolist()))

    # Seed tracks keyed by the real pid: the category codes of songPlaylist1
    # and challenge_df are computed over different frames and may not agree.
    seed_tracks_by_pid = {
        pid: set(uris) for pid, uris in songPlaylist1.groupby('pid')['trackid']
    }

    first_line = 'team_info,team_name,main,your@email.com'
    start = time.time()
    with codecs.open(output_filename, "w") as o:
        o.write("%s \n" % (first_line))
        o.write("\n")
        o.write("\n")
        playlists = zip(challenge_df['pid'].tolist(), challenge_df['pidcat'].tolist())
        for count, (pid, desired_user_id) in enumerate(playlists, start=1):
            # reshape(1, -1) keeps this independent of the embedding width
            one_user_vector = user_latent_matrix[desired_user_id].reshape(1, -1)
            desired_user_label = int(kmeans.predict(one_user_vector)[0])

            final = []
            candidates = candidates_by_cluster.get(desired_user_label)
            if candidates is not None and len(candidates):
                items = np.asarray(candidates, dtype='int32')
                users = np.full(len(items), desired_user_id, dtype='int32')
                probabilities = model.predict([users, items], batch_size=100, verbose=0).ravel()
                # stable sort so ties keep a deterministic order
                ranking = np.argsort(-probabilities, kind='mergesort')

                # Checking for seed tracks in the ranked list; they are skipped.
                seeds = seed_tracks_by_pid.get(pid, set())
                for track_index in items[ranking].tolist():
                    # look up the URI of the candidate that was actually
                    # scored, not of the loop position
                    trackid = trackid_by_index[track_index]
                    if trackid in seeds:
                        continue
                    final.append(trackid)
                    if len(final) >= n_recs:
                        break

            # Line format: "<pid>, <uri>, <uri>, ..." followed by an empty line.
            o.write("%s" % (pid))
            o.write(', '.join(map(str, [''] + final)))
            o.write("\n")
            o.write("\n")
            print(f"songs for playlist {count}")
    logging.debug("generated recommendations in %0.2fs",  time.time() - start)

In [None]:
# Generate recommendations for every challenge playlist and write them to data/myresults.csv.
output('data/myresults.csv')

using model: spotify_NCF_8_[64, 32, 16, 8].h5
Loaded model!
songs for playlist 1
songs for playlist 2
songs for playlist 3
songs for playlist 4
songs for playlist 5


In [None]:
# Prints a fixed reminder message; it neither reads nor changes any notebook state.
print('if its not 9:30 am them its not complete')