In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import keras
from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Multiply, Reshape, Merge, Flatten, Concatenate
from keras.optimizers import Adam
from keras.regularizers import l2

import json, sys, random, os, datetime, math

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Neural Collaborative Filtering with NeuMF

I am going to use only 1/10 of the data (100,000 sample playlists) for training due to time and resource constraints. This approach should scale if you have a server at your disposal. The NeuMF model for neural collaborative filtering combines both GMF and MLP (neural net) approaches. 

### Architecture for the NeuMF model:
![NeuMF Architecture](img/neumf.png)
Image taken from [He et al, 2017](https://arxiv.org/pdf/1708.05031.pdf), who developed this model and describe the approach in greater detail.

### Steps
1. Playlist (more generally called u for user) and item (i) vectors are used to create embeddings (low-dimensional) for each playlist and item.
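2. Generalized Matrix Factorization (GMF) combines the two embeddings with an element-wise product whose components are then weighted by the output layer; classic matrix factorization (a plain dot product) is the special case in which every weight equals 1.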
3. The multi-layer perceptron (MLP) branch learns its own embeddings for users and items. However, instead of taking a dot product of these to obtain the rating, I concatenate them to create a feature vector that is passed on to deeper layers.
4. NeuMF then concatenates the output vectors of the GMF and MLP branches and feeds them through a final sigmoid layer to obtain the final prediction.

In [None]:
%%time
def make_playlist_df(path, num_slices):
    """Load up to ``num_slices`` playlist slice files from ``path`` into one DataFrame.

    Only files whose name starts with ``mpd.slice.`` and ends with ``.json``
    are read, in sorted (lexicographic) filename order. Each file must hold a
    JSON object with a ``'playlists'`` entry; one row is produced per playlist.

    Parameters
    ----------
    path : str
        Directory containing the slice files.
    num_slices : int
        Maximum number of slice files to read.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-slice frames. Each slice keeps its
        own index, so index labels repeat across slices.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no matching file was read.
    """
    df_list = []
    for filename in sorted(os.listdir(path)):
        # stop scanning the directory once enough slices have been loaded
        if len(df_list) >= num_slices:
            break
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        fullpath = os.path.join(path, filename)
        # the context manager closes the handle (the bare open() leaked it);
        # JSON is read as UTF-8 instead of the platform's locale encoding
        with open(fullpath, encoding="utf-8") as slice_file:
            data = json.load(slice_file)
        df_list.append(pd.DataFrame.from_dict(data['playlists'], orient='columns'))
    return pd.concat(df_list)

path = '../mpd.v1/data'
# number of slice files to load; named so the training sample size can be
# tuned in one place instead of being a bare literal in the call
NUM_SLICES = 10
traindata = make_playlist_df(path, NUM_SLICES)

In [None]:
# also read in the challenge dataset which has missing songs
# that I want the model to predict
# (the context manager closes the file handle once the JSON is parsed, and the
# file is read as UTF-8 instead of the platform's locale encoding)
with open('../challenge.v1/challenge_set.json', encoding='utf-8') as challenge_file:
    t = json.load(challenge_file)
challenge_df = pd.DataFrame.from_dict(t['playlists'], orient='columns')

# combine train and challenge so can use cat code to map
# track ids to an index 0-N across both datasets
train_challengedata = pd.concat([traindata, challenge_df])

In [None]:
# Flatten the playlist-level frame into one row per track occurrence.
# Each track dict contributes its URI, artist name and track name, tagged with
# the pid and num_holdouts of the playlist it belongs to.
playlist_fields = zip(train_challengedata['tracks'],
                      train_challengedata['pid'],
                      train_challengedata['num_holdouts'])
songPlaylistArray = [
    [track['track_uri'], track['artist_name'], track['track_name'], pid, num_holdouts]
    for tracks, pid, num_holdouts in playlist_fields
    for track in tracks
]
songPlaylist = pd.DataFrame(
    songPlaylistArray,
    columns=['trackid', 'artist_name', 'track_name', 'pid', 'num_holdouts']
)

print(songPlaylist.shape)
songPlaylist.head()   # track ids with their artist names, track names and playlist ids

In [None]:
# Encode every distinct track id as an integer category code so that tracks
# are addressed by a 0-N index.
track_categories = songPlaylist['trackid'].astype('category')
songPlaylist['trackindex'] = track_categories.cat.codes
# number of distinct track codes
print(songPlaylist['trackindex'].nunique())
songPlaylist.head()

In [None]:
# Separate the rows again: rows with a null num_holdouts go to `train`,
# rows with a non-null num_holdouts go to `challenge`.
has_holdouts = songPlaylist['num_holdouts'].notnull()
train = songPlaylist[~has_holdouts]
challenge = songPlaylist[has_holdouts]
train.head()

In [None]:
# preview the first rows of the challenge subset
challenge.head()

In [None]:
%%time
# save data in dok matrix (optimized sparse matrix object)
# create a sparse playlistid x trackindex matrix
# if a playlistid i has song j, mat[i,j]=1
#
# Rows are addressed by pid, so the row count has to be max(pid) + 1.
# `train.shape[0]` counts (playlist, track) rows instead, which oversizes the
# matrix and every playlist embedding later sized from `mat.shape`.
num_playlist_rows = int(train['pid'].max()) + 1
# columns span every track code in songPlaylist (training and challenge rows)
num_track_cols = songPlaylist['trackindex'].nunique()
mat = sp.dok_matrix((num_playlist_rows, num_track_cols), dtype=np.float32)
for pid, trackindex in zip(train['pid'], train['trackindex']):
    mat[pid, trackindex] = 1.0

In [None]:
# full NCF model
def get_model(num_playlists, num_items, latent_dim=8, dense_layers=(64, 32, 16, 8),
              reg_layers=(0, 0, 0, 0), reg_mf=0):
    """Build the (uncompiled) NeuMF model: a GMF branch and an MLP branch joined
    by a single sigmoid output unit.

    Parameters
    ----------
    num_playlists : int
        ``input_dim`` of the playlist (user) embeddings.
    num_items : int
        ``input_dim`` of the item embeddings.
    latent_dim : int
        Size of the MF embeddings of the GMF branch.
    dense_layers : sequence of int
        ``dense_layers[0]`` is the width of the concatenated MLP embedding
        (playlist and item each get half of it); ``dense_layers[1:]`` are the
        widths of the stacked ReLU ``Dense`` layers.
    reg_layers : sequence of float
        L2 factors: ``reg_layers[0]`` for the MLP embeddings, ``reg_layers[i]``
        for dense layer ``i``. Must be at least as long as ``dense_layers``.
    reg_mf : float
        L2 factor for the MF embeddings.

    Returns
    -------
    keras.models.Model
        Model with inputs ``[user_input, item_input]`` (one int32 id each) and
        one sigmoid output.
    """
    # Defaults are tuples rather than lists so the shared default objects
    # cannot be mutated between calls; they are only indexed and measured here.

    def _embedding(name, input_dim, output_dim, reg):
        # all four embeddings differ only in name, size and L2 factor
        return Embedding(input_dim=input_dim, output_dim=output_dim,
                         name=name,
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg), input_length=1)

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')

    # embedding layer
    mlp_embedding_dim = int(dense_layers[0] / 2)
    mf_user_embedding = _embedding('mf_user_embedding', num_playlists, latent_dim, reg_mf)
    mf_item_embedding = _embedding('mf_item_embedding', num_items, latent_dim, reg_mf)
    mlp_user_embedding = _embedding('mlp_user_embedding', num_playlists,
                                    mlp_embedding_dim, reg_layers[0])
    mlp_item_embedding = _embedding('mlp_item_embedding', num_items,
                                    mlp_embedding_dim, reg_layers[0])

    # MF latent vector: element-wise product of the two MF embeddings
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP latent vector: concatenation of the two MLP embeddings
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent])

    # build dense layers for the MLP branch (index 0 is the embedding width)
    # NOTE(review): activity_regularizer penalises layer outputs, not weights;
    # confirm that kernel_regularizer was not intended.
    for i in range(1, len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    # join both branches and map them to a single probability
    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])
    result = Dense(1, activation='sigmoid',
                   kernel_initializer='lecun_uniform', name='result')

    model = Model(inputs=[input_user, input_item], outputs=result(predict_layer))

    return model

# get the training samples
def get_train_samples(train_mat, num_negatives, rng=None):
    """Build (playlist, item, label) training triples with sampled negatives.

    Every stored entry ``(u, i)`` of ``train_mat`` yields one positive sample
    (label 1), immediately followed by ``num_negatives`` negative samples
    (label 0) for the same playlist ``u``. Negative items are drawn uniformly
    from ``[0, num_items)`` and redrawn until the pair is not stored in
    ``train_mat``.

    Parameters
    ----------
    train_mat : scipy.sparse.dok_matrix
        Interaction matrix; only ``.shape`` and ``.keys()`` are used.
    num_negatives : int
        Number of negative samples generated per positive entry.
    rng : numpy.random.RandomState or numpy.random.Generator, optional
        Source of randomness, for reproducible sampling. When omitted, the
        global ``np.random.randint`` is used, as before.

    Returns
    -------
    tuple of (list, list, list)
        ``(playlist_input, item_input, labels)``, all of equal length.

    Raises
    ------
    ValueError
        If negatives are requested for a playlist that already contains every
        item; rejection sampling would otherwise never terminate.
    """
    if rng is None:
        draw = np.random.randint
    elif hasattr(rng, 'integers'):
        draw = rng.integers        # numpy.random.Generator API
    else:
        draw = rng.randint         # numpy.random.RandomState API

    num_item = train_mat.shape[1]

    # Snapshot the positives once: a plain set gives constant-time membership
    # tests that do not depend on how dok_matrix.keys() is implemented.
    positive_pairs = list(train_mat.keys())
    positives = set(positive_pairs)

    # positives per playlist, used to detect rows with no free item left
    positives_per_playlist = {}
    for u, _ in positive_pairs:
        positives_per_playlist[u] = positives_per_playlist.get(u, 0) + 1

    playlist_input, item_input, labels = [], [], []
    for (u, i) in positive_pairs:
        playlist_input.append(u)
        item_input.append(i)
        labels.append(1)
        if num_negatives > 0 and positives_per_playlist[u] >= num_item:
            raise ValueError(
                'playlist %s contains every item; no negative can be sampled' % (u,))
        # negative instances
        for _ in range(num_negatives):
            j = draw(num_item)
            while (u, j) in positives:
                j = draw(num_item)
            playlist_input.append(u)
            item_input.append(j)
            labels.append(0)
    return playlist_input, item_input, labels

# hyperparameters
verbose = 1                     # verbosity passed to model.fit
num_epochs = 1      # 10?
batch_size = 256
latent_dim = 8                  # size of the MF embeddings
dense_layers = [64, 32, 16, 8]  # MLP embedding width followed by dense layer widths
reg_layers = [0, 0, 0, 0]       # L2 factors for the MLP embeddings / dense layers
reg_mf = 0                      # scalar L2 factor for the MF embeddings (was the list [0];
                                # l2() takes a single number, as get_model's default does)
num_negatives = 4               # negatives sampled per positive entry
learning_rate = 0.001
learner = 'adam'
dataset = 'spotify'             # prefix of the saved model file name

# loading data
train_mat = mat
num_playlists, num_items = train_mat.shape
print('Done loading data!')

In [None]:
%%time

# get model
model = get_model(num_playlists, num_items, latent_dim, dense_layers, reg_layers, reg_mf)
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# train model
# generate training instances
playlist_input, item_input, labels = get_train_samples(train_mat, num_negatives)

# training
# The sampler's output is `playlist_input` and the epoch count is `num_epochs`;
# the names `user_input` and `epochs` used here before were never defined and
# raised NameError on a fresh kernel.
hist = model.fit([np.array(playlist_input), np.array(item_input)], np.array(labels),
                 batch_size=batch_size, epochs=num_epochs, verbose=verbose, shuffle=True)
print(hist.history)

# save model
model_file = '%s_NCF_%d_%s.h5' % (dataset, latent_dim, str(dense_layers))
model.save(model_file, overwrite=True)