In [1]:
## Import Packages:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import theano
import theano.tensor as T
import keras
from keras import backend as K
from keras import initializers
from keras.regularizers import l1, l2, l1_l2
from keras.models import Sequential, Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Concatenate, Reshape, Multiply, Flatten, Dropout
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from evaluate import evaluate_model
from Dataset import Dataset
from time import time
import sys
import GMF, MLP
import argparse

Using TensorFlow backend.


## Read in Champaign user-item interaction csv file to generate dataset:

In [2]:
champaign_user_item_df = pd.read_csv('../yelp_dataset/champaign_user_item.csv', index_col = 0)
champaign_user_item_df.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business_avg,review_count,...,categories,hours,review_id,user_id,stars,useful,funny,cool,text,date
0,4hWDMVtfnpyY72_5QMbthA,Old Time Meat & Deli Shoppe,2018 S Neil St,urbana-champaign,IL,61820,40.091901,-88.245362,5.0,46,...,"Meat Shops, Food, Restaurants, Specialty Food,...","{'Monday': '10:0-18:0', 'Tuesday': '10:0-18:0'...",WruGbPJAuXAl5GP28fH1cA,5rM50rIojxWNQRv0Vmuulg,5.0,0,0,0,This is a wonderful butchershop. Deli selectio...,2011-08-25 01:04:12
1,4hWDMVtfnpyY72_5QMbthA,Old Time Meat & Deli Shoppe,2018 S Neil St,urbana-champaign,IL,61820,40.091901,-88.245362,5.0,46,...,"Meat Shops, Food, Restaurants, Specialty Food,...","{'Monday': '10:0-18:0', 'Tuesday': '10:0-18:0'...",NozR-GCiMHWyhqRbW4TDuA,9ZuE5YJQvNA4PvldrTgmXA,5.0,1,1,1,I'm almost reluctant to give it 5 stars only s...,2016-04-12 05:07:23
2,4hWDMVtfnpyY72_5QMbthA,Old Time Meat & Deli Shoppe,2018 S Neil St,urbana-champaign,IL,61820,40.091901,-88.245362,5.0,46,...,"Meat Shops, Food, Restaurants, Specialty Food,...","{'Monday': '10:0-18:0', 'Tuesday': '10:0-18:0'...",wLWG8YkhTfu3zjAYGNSzfA,w-NiZLoY-TetMRY8LOqMKw,5.0,1,0,2,This is definitely the place to go for excelle...,2016-07-04 01:09:58


In [3]:
unique_business_id = champaign_user_item_df.business_id.unique()
mapping_business_id = {}
ctr = 0
for business_id in unique_business_id:
    mapping_business_id[business_id] = ctr
    ctr += 1
    
champaign_user_item_df['business_id_refined'] = champaign_user_item_df.business_id.map(mapping_business_id)

In [4]:
unique_user_id = champaign_user_item_df.user_id.unique()
mapping_user_id = {}
ctr = 0
for user_id in unique_user_id:
    mapping_user_id[user_id] = ctr
    ctr += 1
    
champaign_user_item_df['user_id_refined'] = champaign_user_item_df.user_id.map(mapping_user_id)

In [5]:
champaign_user_item_df[['user_id_refined', 'business_id_refined']].head(5)

Unnamed: 0,user_id_refined,business_id_refined
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [6]:
sum(champaign_user_item_df.groupby('user_id_refined')['name'].count() >= 10)

502

There are only 502 users who have reviewed 10 or more restaurants in champaign, hence need to expand the dataset a bit

## Read in toronto user-item interaction csv file to generate dataset:

In [7]:
toronto_user_item_df = pd.read_csv('../yelp_dataset/toronto_user_item.csv', index_col = 0)
toronto_user_item_df.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business_avg,review_count,...,categories,hours,review_id,user_id,stars,useful,funny,cool,text,date
0,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,"Restaurants, Italian, Cafes","{'Monday': '7:30-18:0', 'Tuesday': '7:30-18:0'...",SKBNW4QKNiclQ6mB2AQ8MQ,q3JSVBWICgXfO-zuLAp5fg,3.0,0,0,0,The customer service is on point. The food was...,2018-10-04 10:57:11
1,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,"Restaurants, Italian, Cafes","{'Monday': '7:30-18:0', 'Tuesday': '7:30-18:0'...",0dsaJN8eljlYRCqPWN1JCQ,0zW0RwIRwyJ6Qdirqvs5gA,5.0,0,0,0,The staff and workers are really friendly and ...,2017-04-30 13:40:40
2,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,"Restaurants, Italian, Cafes","{'Monday': '7:30-18:0', 'Tuesday': '7:30-18:0'...",aPUINDQsgifg_hSROs4TTA,eurxcv4blzrEs7-IgLGt5w,5.0,0,0,0,This is one great cafe. A little hard to find ...,2015-03-18 22:16:23


In [8]:
sum(toronto_user_item_df.groupby('user_id')['name'].count() >= 10)

7905

There are a substantial number of users who have reviewed at least 10 restaurants

### Filter the dataset to contain only the users who have reviewed at least 10 restaurants or more

In [9]:
grouped = toronto_user_item_df.groupby('user_id')
toronto_user_item_filtered_df = grouped.filter(lambda x: x['name'].count() >= 10)

In [10]:
dataset_to_use = toronto_user_item_filtered_df.copy()

In [11]:
print('There are %d unique users and %d unique items in the dataset after filtering such that each user has \
reviewed at least 10 restaurants.'%(dataset_to_use.user_id.nunique(), dataset_to_use.business_id.nunique()))

There are 7905 unique users and 8546 unique items in the dataset after filtering such that each user has reviewed at least 10 restaurants.


In [12]:
dataset_to_use.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business_avg,review_count,...,categories,hours,review_id,user_id,stars,useful,funny,cool,text,date
3,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,"Restaurants, Italian, Cafes","{'Monday': '7:30-18:0', 'Tuesday': '7:30-18:0'...",l8FlUGAgrAAOIi0fWV3Lgg,ZWpLKIbOC5xjuPWc7ZKe9Q,5.0,0,0,0,"Wonderful spaghetti, simple yet clean environm...",2018-09-03 18:13:28
5,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,"Restaurants, Italian, Cafes","{'Monday': '7:30-18:0', 'Tuesday': '7:30-18:0'...",N_UO6AguthYg7lK2NoduZA,GGI39_EL1ERSqyWX1tEjMA,5.0,11,3,7,A hidden gem near my home. Found this place wh...,2017-08-16 19:45:54
6,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,"Restaurants, Italian, Cafes","{'Monday': '7:30-18:0', 'Tuesday': '7:30-18:0'...",I_nbSUj8mv0BB9Zgx6--UQ,x0cMhVpUcYYHoLdrWSNIMg,5.0,3,0,0,Ambiance/decor- 4\nService- 5+\nFood - 5\nStri...,2015-10-09 00:33:14


### Create simpler IDs for users and items

In [13]:
unique_business_id = dataset_to_use.business_id.unique()
mapping_business_id = {}
ctr = 0
for business_id in unique_business_id:
    mapping_business_id[business_id] = ctr
    ctr += 1
    
dataset_to_use['business_id_refined'] = dataset_to_use.business_id.map(mapping_business_id)

In [14]:
unique_user_id = dataset_to_use.user_id.unique()
mapping_user_id = {}
ctr = 0
for user_id in unique_user_id:
    mapping_user_id[user_id] = ctr
    ctr += 1
    
dataset_to_use['user_id_refined'] = dataset_to_use.user_id.map(mapping_user_id)

In [15]:
dataset_to_use[['user_id_refined', 'business_id_refined']].head(5)

Unnamed: 0,user_id_refined,business_id_refined
3,0,0
5,1,0
6,2,0
7,3,0
10,4,0


In [16]:
dataset_to_use.sort_values(by = ['user_id_refined', 'date'], inplace = True)

In [17]:
dataset_to_use.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business_avg,review_count,...,review_id,user_id,stars,useful,funny,cool,text,date,business_id_refined,user_id_refined
394705,e49eXgKVuR-lsL0-D4vzDw,Momiji,2111 Sheppard Avenue E,toronto,ON,M2J 1W6,43.775377,-79.333972,3.0,22,...,9kb3ywKCxhCQY0ElsLccNA,ZWpLKIbOC5xjuPWc7ZKe9Q,3.0,3,0,2,I went to Momiji at night wanting to find out ...,2010-11-01 01:50:56,6217,0
270638,ik9VvawL-BeAqlxTI1leew,Gonoe Sushi,1310 Don Mills Road,toronto,ON,M3B 2W6,43.74592,-79.346301,3.5,119,...,ehAgpX1OzHGnkf1fut6Few,ZWpLKIbOC5xjuPWc7ZKe9Q,3.0,2,0,0,I went to this place solely on the recommendat...,2014-12-23 02:53:08,3521,0
119044,Nz44ccUso3nq5S2OlQHNlA,Mexico Lindo,"2600 Birchmount Road, Suite 2586",toronto,ON,M1T 2M5,43.789719,-79.302981,4.0,163,...,nieXZ7BPbe_4X4lJexK--w,ZWpLKIbOC5xjuPWc7ZKe9Q,5.0,0,0,0,"Homemade family style catering, I was welcome ...",2014-12-31 02:27:56,1264,0


### Test Data:

In [18]:
def last_value(df):
    return df.iloc[[-1]]

In [19]:
# Save the most recent visit to a restaurant as the test data
test_data = dataset_to_use.groupby('user_id_refined').apply(last_value)

In [20]:
### Sanity check to make sure that the most recent transaction is saved
most_recent_restaurant = []
for idx in range(len(test_data)):
    most_recent_restaurant.append(dataset_to_use.query('user_id_refined == @idx')['business_id_refined'].iloc[-1])
    
if sum(test_data.business_id_refined != most_recent_restaurant):
    print('not successful')
else:
    print('successful!')

successful!


'successful!' indicates that we were successfully able to save the most recent visit by every user in a separate dataframe.

This dataframe upon transformation would serve as a test dataset.

In [21]:
test_data.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business_avg,review_count,...,review_id,user_id,stars,useful,funny,cool,text,date,business_id_refined,user_id_refined
user_id_refined,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,3,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,l8FlUGAgrAAOIi0fWV3Lgg,ZWpLKIbOC5xjuPWc7ZKe9Q,5.0,0,0,0,"Wonderful spaghetti, simple yet clean environm...",2018-09-03 18:13:28,0,0
1,50691,B70iTJjcPkuYn8ouUewWgw,Gusto 101,101 Portland Street,toronto,ON,M5V 2N3,43.645002,-79.400309,4.0,956,...,QOo-iNFVehBpfF_3cx_cdw,GGI39_EL1ERSqyWX1tEjMA,3.0,2,2,3,Ahh! Confused to rate between 4 & 3 . I would ...,2019-09-19 21:38:53,1048,1
2,185431,mVVoK2ADlKbvwitSqFY2hw,Pantry Foods,3456 Bathurst Street,toronto,ON,M6A 2C3,43.72813,-79.43202,4.0,20,...,HKo22Pcdk88c1eq2gHVwjg,x0cMhVpUcYYHoLdrWSNIMg,5.0,0,0,0,Popped in for lunch and all the food was delic...,2019-04-29 17:28:33,2313,2


In [22]:
dataset_to_use.query('user_id_refined == 0')['business_id_refined'].iloc[-1],\
dataset_to_use.query('user_id_refined == 1')['business_id_refined'].iloc[-1],\
dataset_to_use.query('user_id_refined == 2')['business_id_refined'].iloc[-1]

(0, 1048, 2313)

In [23]:
test_data = test_data.droplevel(0)
test_data.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_business_avg,review_count,...,review_id,user_id,stars,useful,funny,cool,text,date,business_id_refined,user_id_refined
3,g6AFW-zY0wDvBl9U82g4zg,Baretto Caffe,1262 Don Mills Road,toronto,ON,M3B 2W7,43.744703,-79.346468,5.0,317,...,l8FlUGAgrAAOIi0fWV3Lgg,ZWpLKIbOC5xjuPWc7ZKe9Q,5.0,0,0,0,"Wonderful spaghetti, simple yet clean environm...",2018-09-03 18:13:28,0,0
50691,B70iTJjcPkuYn8ouUewWgw,Gusto 101,101 Portland Street,toronto,ON,M5V 2N3,43.645002,-79.400309,4.0,956,...,QOo-iNFVehBpfF_3cx_cdw,GGI39_EL1ERSqyWX1tEjMA,3.0,2,2,3,Ahh! Confused to rate between 4 & 3 . I would ...,2019-09-19 21:38:53,1048,1
185431,mVVoK2ADlKbvwitSqFY2hw,Pantry Foods,3456 Bathurst Street,toronto,ON,M6A 2C3,43.72813,-79.43202,4.0,20,...,HKo22Pcdk88c1eq2gHVwjg,x0cMhVpUcYYHoLdrWSNIMg,5.0,0,0,0,Popped in for lunch and all the food was delic...,2019-04-29 17:28:33,2313,2


In [24]:
testRatings = test_data[['user_id_refined', 'business_id_refined']].values.tolist()
testRatings

[[0, 0],
 [1, 1048],
 [2, 2313],
 [3, 289],
 [4, 1107],
 [5, 2111],
 [6, 1264],
 [7, 350],
 [8, 3278],
 [9, 5107],
 [10, 1396],
 [11, 470],
 [12, 5487],
 [13, 169],
 [14, 5517],
 [15, 3986],
 [16, 6502],
 [17, 4106],
 [18, 1754],
 [19, 1153],
 [20, 1193],
 [21, 38],
 [22, 7349],
 [23, 1916],
 [24, 5178],
 [25, 211],
 [26, 0],
 [27, 42],
 [28, 3677],
 [29, 3535],
 [30, 0],
 [31, 1891],
 [32, 6612],
 [33, 565],
 [34, 1095],
 [35, 5542],
 [36, 235],
 [37, 3802],
 [38, 3910],
 [39, 0],
 [40, 1141],
 [41, 7551],
 [42, 949],
 [43, 231],
 [44, 2065],
 [45, 3329],
 [46, 6128],
 [47, 1288],
 [48, 1992],
 [49, 615],
 [50, 3582],
 [51, 7476],
 [52, 2092],
 [53, 1581],
 [54, 4358],
 [55, 3821],
 [56, 3531],
 [57, 160],
 [58, 231],
 [59, 7168],
 [60, 438],
 [61, 1280],
 [62, 3279],
 [63, 2813],
 [64, 4111],
 [65, 7264],
 [66, 6418],
 [67, 2398],
 [68, 384],
 [69, 162],
 [70, 5626],
 [71, 0],
 [72, 1468],
 [73, 1919],
 [74, 0],
 [75, 309],
 [76, 3394],
 [77, 0],
 [78, 3193],
 [79, 5702],
 [80, 1575]

## Train Data (Interaction Matrix)

In [25]:
import scipy as sp

In [26]:
# Create a scipy dok matrix of user-item interaction to serve as input
train_data = sp.sparse.dok_matrix((dataset_to_use.user_id.nunique(), dataset_to_use.business_id.nunique()),\
                                  dtype = np.float32)
# key is (user_id, business_id) and value is 1.0 (to indicate an interaction)
# a value of 1 simply indicates that there is interaction between the user and the item 
# we do not comment whether it is a bad interaction or a good one

testRatings_set = set(tuple(x) for x in testRatings)

for i in range(len(dataset_to_use)):
    user, business = dataset_to_use.user_id_refined.iloc[i], dataset_to_use.business_id_refined.iloc[i]
    
    # check if the given pair of user and business is already a part of 'test' set (skip if already a part)
    if (user, business) not in testRatings_set:
        train_data[user, business] = 1.

In [27]:
train_data

<7905x8546 sparse matrix of type '<class 'numpy.float32'>'
	with 219758 stored elements in Dictionary Of Keys format>

# NCF Implementation:

In [6]:
'''
Created on Aug 9, 2016
Keras Implementation of Neural Matrix Factorization (NeuMF) recommender model in:
He Xiangnan et al. Neural Collaborative Filtering. In WWW 2017.  

@author: Xiangnan He (xiangnanhe@gmail.com)
'''
#################### Arguments ####################
def parse_args():
    parser = argparse.ArgumentParser(description="Run NeuMF.")
    parser.add_argument('--path', nargs='?', default='Data/',
                        help='Input data path.')
    parser.add_argument('--dataset', nargs='?', default='ml-1m',
                        help='Choose a dataset.')
    parser.add_argument('--epochs', type=int, default=100,
                        help='Number of epochs.')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='Batch size.')
    parser.add_argument('--num_factors', type=int, default=8,
                        help='Embedding size of MF model.')
    parser.add_argument('--layers', nargs='?', default='[64,32,16,8]',
                        help="MLP layers. Note that the first layer is the concatenation of user and item embeddings. So layers[0]/2 is the embedding size.")
    parser.add_argument('--reg_mf', type=float, default=0,
                        help='Regularization for MF embeddings.')                    
    parser.add_argument('--reg_layers', nargs='?', default='[0,0,0,0]',
                        help="Regularization for each MLP layer. reg_layers[0] is the regularization for embeddings.")
    parser.add_argument('--num_neg', type=int, default=4,
                        help='Number of negative instances to pair with a positive instance.')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate.')
    parser.add_argument('--learner', nargs='?', default='adam',
                        help='Specify an optimizer: adagrad, adam, rmsprop, sgd')
    parser.add_argument('--verbose', type=int, default=1,
                        help='Show performance per X iterations')
    parser.add_argument('--out', type=int, default=1,
                        help='Whether to save the trained model.')
    parser.add_argument('--mf_pretrain', nargs='?', default='',
                        help='Specify the pretrain model file for MF part. If empty, no pretrain will be used')
    parser.add_argument('--mlp_pretrain', nargs='?', default='',
                        help='Specify the pretrain model file for MLP part. If empty, no pretrain will be used')
    return parser.parse_args()

def get_model(num_users, num_items, mf_dim=10, layers=[10], reg_layers=[0], reg_mf=0):
    assert len(layers) == len(reg_layers)
    num_layer = len(layers) #Number of layers in the MLP
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')
    
    print('Input layer: ', user_input.shape, user_input.dtype)
    
    # Embedding layer
    MF_Embedding_User = Embedding(input_dim = num_users, output_dim = mf_dim, name = 'mf_embedding_user',
                                  embeddings_initializer = 'random_normal', W_regularizer = l2(reg_mf), input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = mf_dim, name = 'mf_embedding_item',
                                  embeddings_initializer = 'random_normal', W_regularizer = l2(reg_mf), input_length=1)  

    MLP_Embedding_User = Embedding(input_dim = num_users, output_dim = int(layers[0]/2), name = "mlp_embedding_user",
                                   embeddings_initializer = 'random_normal', W_regularizer = l2(reg_layers[0]), input_length=1)
    MLP_Embedding_Item = Embedding(input_dim = num_items, output_dim = int(layers[0]/2), name = 'mlp_embedding_item',
                                   embeddings_initializer = 'random_normal', W_regularizer = l2(reg_layers[0]), input_length=1)

    output = MLP_Embedding_User(user_input)
    #print(MLP_Embedding_User.weights)
    
    # MF part
    mf_user_latent = Flatten()(MF_Embedding_User(user_input))
    mf_item_latent = Flatten()(MF_Embedding_Item(item_input))
    
    mf_vector = Multiply()([mf_user_latent, mf_item_latent]) # element-wise multiply

    # MLP part 
    mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
    mlp_item_latent = Flatten()(MLP_Embedding_Item(item_input))
    
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent])
    for idx in range(1, num_layer):
        layer = Dense(layers[idx], W_regularizer= l2(reg_layers[idx]), activation='relu', name="layer%d" %idx)
        mlp_vector = layer(mlp_vector)

    # Concatenate MF and MLP parts
    #mf_vector = Lambda(lambda x: x * alpha)(mf_vector)
    #mlp_vector = Lambda(lambda x : x * (1-alpha))(mlp_vector)
    predict_vector = Concatenate()([mf_vector, mlp_vector])
    
    # Final prediction layer
    prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = "prediction")(predict_vector)
    
    model = Model(input=[user_input, item_input], 
                  output=prediction)
    
    return model

def load_pretrain_model(model, gmf_model, mlp_model, num_layers):
    # MF embeddings
    gmf_user_embeddings = gmf_model.get_layer('user_embedding').get_weights()
    gmf_item_embeddings = gmf_model.get_layer('item_embedding').get_weights()
    model.get_layer('mf_embedding_user').set_weights(gmf_user_embeddings)
    model.get_layer('mf_embedding_item').set_weights(gmf_item_embeddings)
    
    # MLP embeddings
    mlp_user_embeddings = mlp_model.get_layer('user_embedding').get_weights()
    mlp_item_embeddings = mlp_model.get_layer('item_embedding').get_weights()
    model.get_layer('mlp_embedding_user').set_weights(mlp_user_embeddings)
    model.get_layer('mlp_embedding_item').set_weights(mlp_item_embeddings)
    
    # MLP layers
    for i in range(1, num_layers):
        mlp_layer_weights = mlp_model.get_layer('layer%d' %i).get_weights()
        model.get_layer('layer%d' %i).set_weights(mlp_layer_weights)
        
    # Prediction weights
    gmf_prediction = gmf_model.get_layer('prediction').get_weights()
    mlp_prediction = mlp_model.get_layer('prediction').get_weights()
    new_weights = np.concatenate((gmf_prediction[0], mlp_prediction[0]), axis=0)
    new_b = gmf_prediction[1] + mlp_prediction[1]
    model.get_layer('prediction').set_weights([0.5*new_weights, 0.5*new_b])    
    return model

def get_train_instances(train, num_negatives):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    s1 = set(train.keys())
    
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in s1:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

In [7]:
# sys.argv = ['NeuMF.py --dataset champaign_user_item_refined.txt --epochs 20 --batch_size 256 --num_factors 8 \
# --layers [64,32,16,8] --reg_mf 0 --reg_layers [0,0,0,0] --num_neg 4 --lr 0.001 --learner adam --verbose 1 --out 1']

In [8]:
#sys.argv = ['NeuMF.py --dataset ml-1m --epochs 20 --batch_size 256 --num_factors 8 --layers [64,32,16,8] --reg_mf 0 --reg_layers [0,0,0,0] --num_neg 4 --lr 0.001 --learner adam --verbose 1 --out 1']
sys.argv = ['NeuMF.py --dataset ml-1m --epochs 20 --batch_size 256 --num_factors 8 --layers [64,32,16,8] --num_neg 4 --lr 0.001 --learner adam --verbose 1 --out 1 --mf_pretrain Pretrain/ml-1m_GMF_8_1501651698.h5 --mlp_pretrain Pretrain/ml-1m_MLP_[64,32,16,8]_1501652038.h5']

In [9]:
args = parse_args()
num_epochs = args.epochs
batch_size = args.batch_size
mf_dim = args.num_factors
layers = eval(args.layers)
reg_mf = args.reg_mf
reg_layers = eval(args.reg_layers)
num_negatives = args.num_neg
learning_rate = args.lr
learner = args.learner
verbose = args.verbose
mf_pretrain = args.mf_pretrain
mlp_pretrain = args.mlp_pretrain

topK = 10
evaluation_threads = 1 # mp.cpu_count()
print("NeuMF arguments: %s " %(args))
model_out_file = 'Pretrain/%s_NeuMF_%d_%s_%d.h5' %(args.dataset, mf_dim, args.layers, time())

NeuMF arguments: Namespace(batch_size=256, dataset='ml-1m', epochs=100, layers='[64,32,16,8]', learner='adam', lr=0.001, mf_pretrain='', mlp_pretrain='', num_factors=8, num_neg=4, out=1, path='Data/', reg_layers='[0,0,0,0]', reg_mf=0, verbose=1) 


In [60]:
# Loading data

with open('Our_Data/toronto_user_item_refined.txt', 'r') as f:
    dataset = f.readlines()
    
dataset = eval(dataset[0])

train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" 
     %(time()-t1, num_users, num_items, train.nnz, len(testRatings)))

{(3, 0): 1.0,
 (5, 0): 1.0,
 (6, 0): 1.0,
 (7, 0): 1.0,
 (10, 0): 1.0,
 (16, 0): 1.0,
 (19, 0): 1.0,
 (23, 0): 1.0,
 (24, 0): 1.0,
 (26, 0): 1.0,
 (28, 0): 1.0,
 (29, 0): 1.0,
 (30, 0): 1.0,
 (37, 0): 1.0,
 (41, 0): 1.0,
 (46, 0): 1.0,
 (51, 0): 1.0,
 (53, 0): 1.0,
 (56, 0): 1.0,
 (57, 0): 1.0,
 (62, 0): 1.0,
 (63, 0): 1.0,
 (69, 0): 1.0,
 (72, 0): 1.0,
 (79, 0): 1.0,
 (83, 0): 1.0,
 (85, 0): 1.0,
 (92, 0): 1.0,
 (95, 0): 1.0,
 (96, 0): 1.0,
 (100, 0): 1.0,
 (112, 0): 1.0,
 (115, 0): 1.0,
 (121, 0): 1.0,
 (123, 0): 1.0,
 (125, 0): 1.0,
 (130, 0): 1.0,
 (131, 0): 1.0,
 (134, 0): 1.0,
 (135, 0): 1.0,
 (139, 0): 1.0,
 (140, 0): 1.0,
 (148, 0): 1.0,
 (151, 0): 1.0,
 (152, 0): 1.0,
 (154, 0): 1.0,
 (156, 0): 1.0,
 (159, 0): 1.0,
 (161, 0): 1.0,
 (162, 0): 1.0,
 (167, 0): 1.0,
 (178, 0): 1.0,
 (180, 0): 1.0,
 (183, 0): 1.0,
 (184, 0): 1.0,
 (185, 0): 1.0,
 (186, 0): 1.0,
 (187, 0): 1.0,
 (188, 0): 1.0,
 (191, 0): 1.0,
 (194, 0): 1.0,
 (201, 0): 1.0,
 (205, 0): 1.0,
 (209, 0): 1.0,
 (214, 0):

In [10]:
# Loading data
t1 = time()
dataset = Dataset(args.path + args.dataset)
dataset
train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" 
     %(time()-t1, num_users, num_items, train.nnz, len(testRatings)))

Load data done [15.1 s]. #user=6040, #item=3706, #train=994169, #test=6040


In [113]:
train

<6040x3706 sparse matrix of type '<class 'numpy.float32'>'
	with 994169 stored elements in Dictionary Of Keys format>

In [115]:
len(testNegatives[0]), testNegatives[0]

(99,
 [1064,
  174,
  2791,
  3373,
  269,
  2678,
  1902,
  3641,
  1216,
  915,
  3672,
  2803,
  2344,
  986,
  3217,
  2824,
  2598,
  464,
  2340,
  1952,
  1855,
  1353,
  1547,
  3487,
  3293,
  1541,
  2414,
  2728,
  340,
  1421,
  1963,
  2545,
  972,
  487,
  3463,
  2727,
  1135,
  3135,
  128,
  175,
  2423,
  1974,
  2515,
  3278,
  3079,
  1527,
  2182,
  1018,
  2800,
  1830,
  1539,
  617,
  247,
  3448,
  1699,
  1420,
  2487,
  198,
  811,
  1010,
  1423,
  2840,
  1770,
  881,
  1913,
  1803,
  1734,
  3326,
  1617,
  224,
  3352,
  1869,
  1182,
  1331,
  336,
  2517,
  1721,
  3512,
  3656,
  273,
  1026,
  1991,
  2190,
  998,
  3386,
  3369,
  185,
  2822,
  864,
  2854,
  3067,
  58,
  2551,
  2333,
  2688,
  3703,
  1300,
  1924,
  3118])

In [15]:
testRatings

[[0, 25],
 [1, 133],
 [2, 207],
 [3, 208],
 [4, 222],
 [5, 396],
 [6, 74],
 [7, 91],
 [8, 514],
 [9, 659],
 [10, 820],
 [11, 829],
 [12, 844],
 [13, 105],
 [14, 519],
 [15, 218],
 [16, 246],
 [17, 228],
 [18, 770],
 [19, 641],
 [20, 1167],
 [21, 1195],
 [22, 934],
 [23, 603],
 [24, 872],
 [25, 352],
 [26, 839],
 [27, 1299],
 [28, 1458],
 [29, 1487],
 [30, 274],
 [31, 1521],
 [32, 459],
 [33, 91],
 [34, 1647],
 [35, 1071],
 [36, 808],
 [37, 244],
 [38, 237],
 [39, 1696],
 [40, 303],
 [41, 730],
 [42, 104],
 [43, 472],
 [44, 1772],
 [45, 1805],
 [46, 1624],
 [47, 360],
 [48, 968],
 [49, 1879],
 [50, 1595],
 [51, 788],
 [52, 245],
 [53, 1236],
 [54, 171],
 [55, 534],
 [56, 7],
 [57, 1719],
 [58, 170],
 [59, 488],
 [60, 2063],
 [61, 2119],
 [62, 743],
 [63, 876],
 [64, 693],
 [65, 41],
 [66, 82],
 [67, 19],
 [68, 485],
 [69, 237],
 [70, 317],
 [71, 18],
 [72, 1148],
 [73, 1646],
 [74, 939],
 [75, 267],
 [76, 445],
 [77, 1459],
 [78, 73],
 [79, 123],
 [80, 44],
 [81, 991],
 [82, 1904],
 [83

In [13]:
train

<6040x3706 sparse matrix of type '<class 'numpy.float32'>'
	with 994169 stored elements in Dictionary Of Keys format>

In [14]:
6040*3706

22384240

In [None]:
print(num_users, num_items, mf_dim, layers, reg_layers, reg_mf)
print(type(layers[1]))