In [2]:
import numpy as np
import theano.tensor as T
import keras
from keras import backend as K
from keras import initializations
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2


Using Theano backend.


In [3]:
import sys
import math
import argparse
from time import time

In [4]:
from dataset import Dataset

  (94, 4219)	1.0
  (17, 979)	1.0
  (331, 13828)	1.0
  (104, 5479)	1.0
  (481, 20172)	1.0
  (792, 28958)	1.0
  (965, 3542)	1.0
  (26, 931)	1.0
  (105, 5577)	1.0
  (134, 350)	1.0
  (483, 20281)	1.0
  (946, 28275)	1.0
  (130, 6777)	1.0
  (262, 12086)	1.0
  (955, 2063)	1.0
  (454, 10010)	1.0
  (456, 18970)	1.0
  (272, 5569)	1.0
  (751, 1046)	1.0
  (294, 7527)	1.0
  (381, 14077)	1.0
  (452, 1064)	1.0
  (968, 18113)	1.0
  (380, 311)	1.0
  (182, 8986)	1.0
  :	:
  (377, 15936)	1.0
  (553, 22172)	1.0
  (72, 3868)	1.0
  (932, 13269)	1.0
  (27, 1311)	1.0
  (768, 9736)	1.0
  (242, 2348)	1.0
  (456, 17401)	1.0
  (97, 5010)	1.0
  (573, 18191)	1.0
  (895, 1051)	1.0
  (689, 7178)	1.0
  (439, 6507)	1.0
  (262, 1)	1.0
  (470, 19770)	1.0
  (548, 2830)	1.0
  (448, 18705)	1.0
  (83, 4172)	1.0
  (453, 17920)	1.0
  (570, 2410)	1.0
  (955, 33256)	1.0
  (357, 15324)	1.0
  (794, 934)	1.0
  (651, 1011)	1.0
  (68, 3499)	1.0
1001 34444


In [5]:
#################### Arguments ####################
def parse_args():
    parser = argparse.ArgumentParser(description="Run GMF.")
    parser.add_argument('--path', nargs='?', default='',
                        help='Input data path.')
    parser.add_argument('--dataset', nargs='?', default='mpd.slice.0-999.json',
                        help='Choose a dataset.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs.')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Batch size.')
    parser.add_argument('--num_factors', type=int, default=8,
                        help='Embedding size.')
    parser.add_argument('--regs', nargs='?', default='[0,0]',
                        help="Regularization for user and item embeddings.")
    parser.add_argument('--num_neg', type=int, default=4,
                        help='Number of negative instances to pair with a positive instance.')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate.')
    parser.add_argument('--learner', nargs='?', default='adam',
                        help='Specify an optimizer: adagrad, adam, rmsprop, sgd')
    parser.add_argument('--verbose', type=int, default=1,
                        help='Show performance per X iterations')
    parser.add_argument('--out', type=int, default=1,
                        help='Whether to save the trained model.')
    return parser.parse_args()

We create the negative cases by generating a random number (until we get one for which there is not entry in the dataset).

In [23]:
def get_train_instances(train, pMat, num_negatives):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in pMat.keys():
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

In [7]:
def init_normal(shape, name=None):
    return initializations.normal(shape, scale=0.01, name=name)

In [8]:
def get_model(num_users, num_items, latent_dim, regs=[0,0]):
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_User = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embedding',
                                  init = init_normal, W_regularizer = l2(regs[0]), input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding',
                                  init = init_normal, W_regularizer = l2(regs[1]), input_length=1)   
    
    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(MF_Embedding_User(user_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))
    
    # Element-wise product of user and item embeddings 
    predict_vector = merge([user_latent, item_latent], mode = 'mul')
    
    # Final prediction layer
    #prediction = Lambda(lambda x: K.sigmoid(K.sum(x)), output_shape=(1,))(predict_vector)
    prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(predict_vector)
    
    model = Model(input=[user_input, item_input], 
                output=prediction)

    return model

In [25]:
if __name__ == '__main__':
    args = parse_args()
    num_factors = args.num_factors
    regs = eval(args.regs)
    num_negatives = args.num_neg
    learner = args.learner
    learning_rate = args.lr
    epochs = args.epochs
    batch_size = args.batch_size
    verbose = args.verbose
    
    print("GMF arguments: %s" %(args))
    model_out_file = 'Pretrain/%s_GMF_%d_%d.h5' %(args.dataset, num_factors, time())
    
    # Loading data
    dataset = Dataset(args.path + args.dataset)
    train = dataset.trainMatrix
    pMat = dataset.playlistMatrix 
    num_users, num_items = train.shape
    print("Loaded data")
    
    # Build model
    model = get_model(num_users, num_items, num_factors, regs)
    if learner.lower() == "adagrad": 
        model.compile(optimizer=Adagrad(lr=learning_rate), loss='binary_crossentropy')
    elif learner.lower() == "rmsprop":
        model.compile(optimizer=RMSprop(lr=learning_rate), loss='binary_crossentropy')
    elif learner.lower() == "adam":
        model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy')
    else:
        model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy')
    print(model.summary())
    
    
    # Train model
    best_iter =  -1
    for epoch in range(epochs):
        t1 = time()
        # Generate training instances
        user_input, item_input, labels = get_train_instances(train, pMat, num_negatives)
        
        # Training
        hist = model.fit([np.array(user_input), np.array(item_input)], #input
                         np.array(labels), # labels 
                         batch_size=batch_size, nb_epoch=1, verbose=0, shuffle=True)
        t2 = time()
        
        # Evaluation
        if epoch %verbose == 0:
            loss = hist.history['loss'][0]
            print('Iteration %d [%.1f s]: loss = %.4f [%.1f s]' 
                  % (epoch,  t2-t1, loss, time()-t2))
            if args.out > 0:
                model.save_weights(model_out_file, overwrite=True)

    if args.out > 0:
        print("The GMF model is saved to %s" %(model_out_file))
    u = 262
    items = [1064 ,174 ,2791 ,3373 ,269 ,2678 ,1902 ,3641 ,1216 ,915 ,3672 ,2803 ,2344 ,986 ,3217 ,2824 ,2598 ,464 ,2340]
    items.append(12086)
    user = np.full(len(items), u, dtype = 'int32')
    
    predictions = model.predict([user, np.array(items)])
    print(predictions)

usage: ipykernel_launcher.py [-h] [--path [PATH]] [--dataset [DATASET]]
                             [--epochs EPOCHS] [--batch_size BATCH_SIZE]
                             [--num_factors NUM_FACTORS] [--regs [REGS]]
                             [--num_neg NUM_NEG] [--lr LR]
                             [--learner [LEARNER]] [--verbose VERBOSE]
                             [--out OUT]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-aa2a4af1-ce19-4320-aa1d-d952b609b13c.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


python3 GMF.py --dataset ml-1m --epochs 20 --batch_size 256 --num_factors 8 --regs [0,0] --num_neg 4 --lr 0.001 --learner adam --verbose 1 --out 1

In [26]:
! python3 GMF.py

Using Theano backend.
1.0
66721
1001 34444
GMF arguments: Namespace(batch_size=64, dataset='mpd.slice.0-999.json', epochs=10, learner='adam', lr=0.001, num_factors=8, num_neg=4, out=1, path='', regs='[0,0]', verbose=1)
Loaded data
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_input (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
item_input (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
user_embedding (Embedding)       (None, 1, 8)          8008        user_input[0][0]                 
______________________________________________________________

In [14]:
import scipy.sparse as sp

In [15]:
mat = sp.dok_matrix((1, 1), dtype=np.float32)

In [None]:
print(mat.has_key(1))

In [None]:
#predicting for one playlist
