In [67]:
import numpy as np
import scipy.sparse as sp
import theano
import theano.tensor as T
import os
os.environ['KERAS_BACKEND'] = 'theano'
import keras
from keras import backend as K
from keras import initializations
from keras.regularizers import l1, l2, l1l2
from keras.models import Sequential, Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape,  Flatten, Dropout
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from time import time
import sys
import argparse
import csv
import pandas as pd
from math import ceil
from sklearn.preprocessing import OneHotEncoder


In [50]:
debug=1
rating_size=5 #used in one hot encoding

In [82]:
def init_normal(shape, name=None):
    return initializations.normal(shape, scale=0.01, name=name)

In [83]:
def get_model(num_users, num_items, mf_dim=10, layers=[10], reg_layers=[0], reg_mf=0):
    assert len(layers) == len(reg_layers)
    num_layer = len(layers) #Number of layers in the MLP
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')
    
    # Embedding layer
    MF_Embedding_User = Embedding(input_dim = num_users, output_dim = mf_dim, name = 'mf_embedding_user',
                                  init = init_normal, W_regularizer = l2(reg_mf), input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = mf_dim, name = 'mf_embedding_item',
                                  init = init_normal, W_regularizer = l2(reg_mf), input_length=1)   
    if(debug):
        print("layers values {}".format(layers[0]))
    MLP_Embedding_User = Embedding(input_dim = num_users, output_dim = int(layers[0]/2), name = "mlp_embedding_user",
                                  init = init_normal, W_regularizer = l2(reg_layers[0]), input_length=1)
    MLP_Embedding_Item = Embedding(input_dim = num_items, output_dim = int(layers[0]/2), name = 'mlp_embedding_item',
                                  init = init_normal, W_regularizer = l2(reg_layers[0]), input_length=1)   
    
    # MF part
    MF_Embedding_User(user_input)
    mf_user_latent = Flatten()(MF_Embedding_User(user_input))
    mf_item_latent = Flatten()(MF_Embedding_Item(item_input))
    mf_vector = merge([mf_user_latent, mf_item_latent], mode = 'mul') # element-wise multiply

    # MLP part 
    mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
    mlp_item_latent = Flatten()(MLP_Embedding_Item(item_input))
    mlp_vector = merge([mlp_user_latent, mlp_item_latent], mode = 'concat')
    for idx in range(1, num_layer):
        layer = Dense(layers[idx], W_regularizer= l2(reg_layers[idx]), activation='relu', name="layer%d" %idx)
        mlp_vector = layer(mlp_vector)

    predict_vector = merge([mf_vector, mlp_vector], mode = 'concat')
    
    # Final prediction layer
    prediction=Dense(1, activation='relu',init='lecun_uniform', name = "prediction")(predict_vector)
    

    
    model = Model(input=[user_input, item_input], 
                  output=prediction)
    
    return model

In [84]:
def get_train_instances(train):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(train[u,i])
    return user_input, item_input, labels

In [85]:
def read_dataset(train_file_name,test_file_name,ratings_file_name):
    train_df=pd.read_csv(train_file_name)
    test_df=pd.read_csv(test_file_name)
    ratings_file=pd.read_csv(ratings_file_name)
    movies=ratings_file['movieId'].unique()
    num_items=len(movies)
    num_users=len(ratings_file['userId'].unique())
    # movies_to_index={}
    # for i in range(len(movies)):
    #     movies_to_index[movies[i]]=i
    # index_to_movies=movies
    if(debug):
        print("In read dataset function...Number of items {}".format(num_items))
        print("In read dataset function...Number of users {}".format(num_users))
    trainMatrix= sp.dok_matrix((num_users+1,num_items+1),dtype=np.float32)
    for i in range (len(train_df)):
        current_user_num=train_df['userId'][i]
        current_item_num=train_df['movieId'][i]
        # current_item_num=movies_to_index[current_item_num]
        #print(current_user_num)
        #print(current_item_num)
        trainMatrix[current_user_num,current_item_num]=train_df['rating'][i]
    testRatings=[]
    testLabels=[]
    for i in range (len(test_df)):
        current_user_num=test_df['userId'][i]
        current_item_num=test_df['movieId'][i]
        # current_item_num=movies_to_index[current_item_num]
        testRatings.append([current_user_num,current_item_num])  
        testLabels.append(test_df['rating'][i])
    if(debug):
        print("dataset generation completed")
    return num_users,num_items,trainMatrix,testRatings,testLabels

In [128]:
def evaluate_model(model,user_input,item_input,labels, testRatings,testLabels):
    #metric MAE
    #for train
    print("RMSE,MAE on train set: {}".format(model.evaluate(x=[np.array(user_input),np.array(item_input)],y=np.array(labels),verbose=0)) )
    testRatingUsers=np.array(testRatings)[:,0]
    testRatingItems=np.array(testRatings)[:,1]
    print("RMSE,MAE on test set: {}".format(model.evaluate(x=[testRatingUsers,testRatingItems],y=np.array(testLabels),verbose=0)) )

In [129]:
if __name__ == '__main__':

#################### Arguments ####################

    num_epochs = 100    
    batch_size = 256
    mf_dim = 8
    layers = [64,32,16,8]
    reg_mf = 0
    reg_layers = [0,0,0,0]
    num_negatives = 4
    learning_rate = 1e-4
    learner = 'adam'
    verbose = 1
    mf_pretrain = ''
    mlp_pretrain = ''
    path='Data/' 
    test_file_name="test12.csv"
    train_file_name="train12.csv"
    ratings_file_name="ratings.csv" 
    topK = 10
    evaluation_threads = 1#mp.cpu_count()
    #model_out_file = 'Pretrain/%s_NeuMF_%d_%s_%d.h5' %(dataset, mf_dim, layers, time())
    # Loading data
    t1 = time()
    #dataset = Dataset(path + dataset)
    #train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
    num_users, num_items,train,testRatings,testLabels = read_dataset(path+train_file_name,path+test_file_name,path+ratings_file_name)
    print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" 
          %(time()-t1, num_users, num_items, train.nnz, len(testRatings)))
    
    # Build model
    model = get_model(int(num_users+1), int(num_items+1), mf_dim, layers, reg_layers, reg_mf)

    model.compile(optimizer=Adam(lr=learning_rate), loss= ['mse'], metrics=['mean_absolute_error'])


        
    # Training model
    for epoch in range(num_epochs):
        t1 = time()
        # Generate training instances
        user_input, item_input, labels = get_train_instances(train)
        
        # Training
        hist = model.fit([np.array(user_input), np.array(item_input)], #input
                         np.array(labels), # labels 
                         batch_size=batch_size, nb_epoch=1, verbose=1, shuffle=True)
        t2 = time()
        
        # Evaluation
        if epoch %2 == 0:
            evaluate_model(model,user_input,item_input,labels, testRatings,testLabels)

    print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg))
    if out > 0:
        print("The best NeuMF model is saved to %s" %(model_out_file))


In read dataset function...Number of items 9724
In read dataset function...Number of users 610
dataset generation completed
Load data done [5.4 s]. #user=610, #item=9724, #train=80669, #test=20167
layers values 64
Epoch 1/1
RMSE,MAE on train set: [7.671884774006136, 2.5743324053120666]
RMSE,MAE on test set: [7.8911848139643475, 2.6131724697469974]
Epoch 1/1
Epoch 1/1
RMSE,MAE on train set: [0.8198424077078034, 0.7049304804157757]
RMSE,MAE on test set: [1.1230725888241357, 0.8477608562003475]
Epoch 1/1
Epoch 1/1
RMSE,MAE on train set: [0.7030258622404192, 0.6441814227954873]
RMSE,MAE on test set: [1.057832613353152, 0.8188962694149435]
Epoch 1/1
Epoch 1/1
RMSE,MAE on train set: [0.6606441069305269, 0.6202708856293764]
RMSE,MAE on test set: [1.0742547480267466, 0.8226110258241682]
Epoch 1/1
Epoch 1/1
RMSE,MAE on train set: [0.6383909770395761, 0.6074220647208993]
RMSE,MAE on test set: [1.1124557013412233, 0.8350144089089924]
Epoch 1/1

KeyboardInterrupt: 