In [6]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import json, os, datetime

## Neural Collaborative Filtering with NeuMF

I currently have only GMF implemented here- I wanted to see how it works. I will implement NeuMF (which combines GMF and neural CF) over the next few days.

#### Architecture for the NeuMF model:
![NeuMF Architecture](img/neumf.png)

In [8]:
%%time
def make_playlist_df(path, num_slices):
    df_list = []
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        if count < num_slices and filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            data = json.load(open(fullpath))
            slice_df = pd.DataFrame.from_dict(data['playlists'], orient='columns')
            df_list.append(slice_df)
            count += 1
    return pd.concat(df_list)

path = 'mpd.v1/data'
traindata = make_playlist_df(path, 10)

CPU times: user 2.08 s, sys: 515 ms, total: 2.6 s
Wall time: 2.85 s


In [10]:
# also read in the challenge dataset which has missing songs we 
# want our model to predict
t = json.load(open('challenge.v1/challenge_set.json'))
challenge_df = pd.DataFrame.from_dict(t['playlists'], orient='columns')
#Combine train and challenge so we can use cat code to map
#track ids to an index 0-N across both datasets
train_challengedata = pd.concat([traindata, challenge_df])

In [11]:
# turn playlist level dataframe into song level dataframe
songPlaylistArray = []
for index, row in train_challengedata.iterrows():
    for track in row['tracks']:
        songPlaylistArray.append([track['track_uri'], track['artist_name'], track['track_name'], row['pid'], row['num_holdouts']])
songPlaylist = pd.DataFrame(songPlaylistArray, columns=['trackid', 'artist_name', 'track_name', 'pid', 'num_holdouts'])

print(songPlaylist.shape)
songPlaylist.head(10)   #is a df of all track ids, cooresponding artist names, track names and playlist ids

(951568, 5)


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,


In [12]:
# Turn songs into their unqiue cat codes so we have a 0-N index for tracks
songPlaylist['trackindex'] = songPlaylist['trackid'].astype('category').cat.codes
print(len(songPlaylist['trackindex'].unique()))
songPlaylist.head(10)

189359


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,,12216
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,,153108
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,,13148
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,,28658
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,,43317
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,,13414
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,,149445
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,,77773
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,,176962
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,,59024


In [13]:
# split apart training and challenge data
train = songPlaylist[pd.isnull(songPlaylist['num_holdouts'])]
challenge = songPlaylist[pd.notnull(songPlaylist['num_holdouts'])]
train.head(10)

Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,,12216
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,,153108
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,,13148
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,,28658
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,,43317
5,spotify:track:0XUfyU2QviPAs6bxSpXYG4,Usher,Yeah!,0,,13414
6,spotify:track:68vgtRHr7iZHpzGpon6Jlo,Usher,My Boo,0,,149445
7,spotify:track:3BxWKCI06eQ5Od8TY2JBeA,The Pussycat Dolls,Buttons,0,,77773
8,spotify:track:7H6ev70Weq6DdpZyyTmUXk,Destiny's Child,Say My Name,0,,176962
9,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM,OutKast,Hey Ya! - Radio Mix / Club Mix,0,,59024


In [14]:
#Save data in dok matrix (optimized sparse matrix object)
    #Create a sparse pid x trackindex matrix
    #If a pid i has song j, mat[i,j]=1
mat = sp.dok_matrix((train.shape[0], len(songPlaylist['trackindex'].unique())), dtype=np.float32)
for pid, trackindex in zip(train['pid'], train['trackindex']):
    mat[pid, trackindex] = 1.0

In [35]:
%%time
import numpy as np
import theano.tensor as T
import keras
from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Multiply, Reshape, Merge, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from time import time
# import multiprocessing as mp
import sys
import math

def init_normal(shape, name=None):
    return initializers.normal(shape, scale=0.01, name=name)

# currently just GMF
def get_model(num_playlists, num_items, latent_dim, regs=[0,0]):
    # Input variables
    playlist_input = Input(shape=(1,), dtype='int32', name = 'playlist_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_playlist = Embedding(input_dim = num_playlists, output_dim = latent_dim, name = 'playlist_embedding',
                                  embeddings_initializer = initializers.RandomNormal(), embeddings_regularizer = l2(regs[0]), input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding',
                                  embeddings_initializer = initializers.RandomNormal(), embeddings_regularizer = l2(regs[1]), input_length=1)   
    
    # Crucial to flatten an embedding vector!
    playlist_latent = Flatten()(MF_Embedding_playlist(playlist_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))
    
    # Element-wise product of playlist and item embeddings 
    predict_vector = Multiply()([playlist_latent, item_latent])
    
    # Final prediction layer
    #prediction = Lambda(lambda x: K.sigmoid(K.sum(x)), output_shape=(1,))(predict_vector)
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(predict_vector)
    
    model = Model(inputs=[playlist_input, item_input], 
                outputs=prediction)
    return model

# get the training samples
def get_train_samples(train_mat, num_negatives):
    user_input, item_input, labels = [], [], []
    num_user, num_item = train_mat.shape
    for (u, i) in train_mat.keys():
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_item)
            while (u, j) in train_mat.keys():
                j = np.random.randint(num_item)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

# Specify hyperparameters
latent_dim = 8
regs = [0,0]
num_negatives = 4
learner = 'adam'
learning_rate = 0.001
epochs = 5
batch_size = 200
verbose = 1
    
# Loading data
train_mat = mat
num_users, num_items = train_mat.shape
print('Done loading data!')

Done loading data!
CPU times: user 188 µs, sys: 44 µs, total: 232 µs
Wall time: 216 µs


In [40]:
%%time
# Build model
model = get_model(num_playlists, num_items, latent_dim, regs)
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
    
# train model
# generate training instances
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)

# training
# training
hist = model.fit([np.array(user_input), np.array(item_input)], np.array(labels), 
                 batch_size=batch_size, epochs=epochs, verbose=verbose, shuffle=True)
print(hist.history)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
playlist_input (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
playlist_embedding (Embedding)  (None, 1, 8)         5364544     playlist_input[0][0]             
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 8)         1514872     item_input[0][0]                 
__________________________________________________________________________________________________
flatten_25

(3309230,)
(3309230,)
(3309230,)
(670568, 189359)


In [44]:
print(len(songPlaylist['trackindex'].unique()))

189359
