In [1]:
import numpy as np
import theano.tensor as T
import keras
import pandas as pd
import numpy as np
import scipy.sparse as sp
import sklearn
import tensorflow as tf
from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape, Flatten
from keras.layers import Multiply
from keras.layers import Concatenate
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2
# from Dataset import Dataset
# from evaluate import evaluate_model
from time import time
import multiprocessing as mp
import sys
import math
import argparse

import heapq # for retrieval topK

import itertools

Using TensorFlow backend.


In [3]:
# to show multiple results in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
class Dataset(object):
    '''
    classdocs
    '''

    def __init__(self, path):
        '''
        Constructor
        '''
        self.trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating.csv")
        self.testRatings = self.load_rating_file_as_list(path + ".test.rating.csv")
        self.testNegatives = self.load_negative_file(path + ".test.negative.csv")
        assert len(self.testRatings) == len(self.testNegatives)
        
        self.num_users, self.num_items = self.trainMatrix.shape
        
    def load_rating_file_as_list(self, filename):
        ratingList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split(",")
                user, item = int(arr[0]), int(arr[1])
                ratingList.append([user, item])
                line = f.readline()
        return ratingList
    
    def load_negative_file(self, filename):
        negativeList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                negatives = []
                for x in arr[1: ]:
                    negatives.append(int(x))
                negativeList.append(negatives)
                line = f.readline()
        return negativeList
    
    def load_rating_file_as_matrix(self, filename):
        '''
        Read .rating file and Return dok matrix.
        The first line of .rating file is: num_users\t num_items
        '''
        # Get number of users and items
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split(",")
                u, i = int(arr[0]), int(arr[1])
                num_users = max(num_users, u)
                num_items = max(num_items, i)
                line = f.readline()
        # Construct matrix
        mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split(",")
                user, item = int(arr[0]), int(arr[1])
                rating = 1
                if (rating > 0):
                    mat[user, item] = 1.0
                line = f.readline()    
        return mat

In [5]:
# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
    Return: score of each test rating.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K
        
    hits, ndcgs = [],[]
    if(num_thread > 1): # Multi-thread
        pool = multiprocessing.Pool(processes=num_thread)
        res = pool.map(eval_one_rating, range(len(_testRatings)))
        pool.close()
        pool.join()
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)
    # Single thread
    for idx in range(len(_testRatings)):
        (hr,ndcg) = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)      
    return (hits, ndcgs)
  
  
def eval_one_rating(idx):
    rating = _testRatings[idx]
    items = _testNegatives[idx]
    u = rating[0]
    gtItem = rating[1]
    items.append(gtItem)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype = 'int32')
    predictions = _model.predict([users, np.array(items)], 
                                 batch_size=100, verbose=0)
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]
    items.pop()
    
    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

In [6]:
from keras import backend as K

def init_normal(shape, dtype=None):
    return K.random_normal(shape, dtype=dtype)

In [45]:
# def init_normal(shape, name=None):
#     return initializers.normal(shape, scale=0.01, name=name)

In [46]:
def get_model(num_users, num_items, latent_dim, regs=[0,0]):
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_User = Embedding(input_dim = num_users, output_dim = latent_dim,
                                  embeddings_initializer = 'uniform', embeddings_regularizer = l2(regs[0]), input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim,
                                  embeddings_initializer = 'uniform', embeddings_regularizer = l2(regs[1]), input_length=1)   
    
    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(MF_Embedding_User(user_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))
    
    # Element-wise product of user and item embeddings 
    # predict_vector = merge([user_latent, item_latent], mode = 'mul')
    predict_vector = Multiply()([user_latent, item_latent])
    # Final prediction layer
    #prediction = Lambda(lambda x: K.sigmoid(K.sum(x)), output_shape=(1,))(predict_vector)
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform')(predict_vector)
    ## prediction is the output of the Dense layer, which has 1D dimension
    model = Model(input=[user_input, item_input], 
                output=prediction)

    return model

In [92]:
user_input[0:10]
item_input[0:10]

[9231, 9231, 9231, 9231, 9231, 9231, 9231, 9231, 9231, 9231]

[488, 838, 1422, 1292, 429, 1223, 386, 1328, 236, 179]

In [172]:
item_input.uniques()

AttributeError: 'list' object has no attribute 'uniques'

In [47]:
def get_train_instances(train, num_negatives):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in train:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

In [7]:
path = "/Users/xun/Documents/Thesis/dataset/ks-3m"
# dataset = 'ml-1m'
num_factors = 8 # number of lalent factors for user_input and itme_input
regs = [0,0]
num_negatives = 4
learner = 'adam'
learning_rate = 0.001
epochs = 10
batch_size = 256
verbose = 1
out = 1
    
topK = 10
evaluation_threads = 1 #mp.cpu_count()
# print("GMF arguments: %s" %(args))
# model_out_file = '/Users/xun/Documents/Thesis/Pretrain/GMF_%d_%d.h5' %(num_factors, time())

In [8]:
# df = pd.read_csv('/content/drive/My Drive/RS/ml-1m.test.negative.csv')

# f = open('/content/drive/My Drive/RS/ml-1m.test.negative.csv', 'r')
# ds = pd.read_csv('/Users/xun/Documents/Thesis/ml-1m.test.negative.csv')
# ds = open('/Users/xun/Documents/Thesis/ml-1m.test.negative.csv', 'r')

In [9]:
# path = '/content/drive/My Drive/RS/ml-1m'
# f = open(path, 'r')

In [10]:
dataset = Dataset(path)

In [11]:
# Loading data
t1 = time()
dataset = Dataset(path)
train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
#     train = pd.read_csv('ml-1m.train.rating.csv')
#     testRatings= pd.read_csv('ml-1m.test.rating.csv')
#     testNegatives = pd.read_csv('ml-1m.test.negative.csv')
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d,#test=%d" 
          %(time()-t1, num_users, num_items, len(testRatings)))

Load data done [11.3 s]. #user=25800, #item=1486,#test=25800


In [77]:
len(testNegatives[0])

100

In [80]:
# Build model
model = get_model(num_users, num_items, num_factors, regs)
if learner.lower() == "adagrad": 
    model.compile(optimizer=Adagrad(lr=learning_rate), loss='binary_crossentropy')
elif learner.lower() == "rmsprop":
    model.compile(optimizer=RMSprop(lr=learning_rate), loss='binary_crossentropy')
elif learner.lower() == "adam":
    model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy')
else:
    model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy')
#print(model.summary())

# Init performance
t1 = time()
(hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
#mf_embedding_norm = np.linalg.norm(model.get_layer('user_embedding').get_weights())+np.linalg.norm(model.get_layer('item_embedding').get_weights())
#p_norm = np.linalg.norm(model.get_layer('prediction').get_weights()[0])
print('Init: HR = %.4f, NDCG = %.4f\t [%.1f s]' % (hr, ndcg, time()-t1))



Init: HR = 0.0965, NDCG = 0.0438	 [20.7 s]


In [82]:
# Train model
best_hr, best_ndcg, best_iter = hr, ndcg, -1
for epoch in range(epochs):
    t1 = time()
    # Generate training instances
    user_input, item_input, labels = get_train_instances(train, num_negatives)

    # Training
    hist = model.fit([np.array(user_input), np.array(item_input)], #input
                     np.array(labels), # labels 
                     batch_size=batch_size, epochs=epochs, verbose=0, shuffle=True)
    t2 = time()

    # Evaluation
    if epoch %verbose == 0:
        (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
        hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
        print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' 
              % (epoch,  t2-t1, hr, ndcg, loss, time()-t2))
        if hr > best_hr:
            best_hr, best_ndcg, best_iter = hr, ndcg, epoch
#                 if out > 0:
#                     model.save_weights(model_out_file, overwrite=True)

print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg))


Instructions for updating:
Use tf.cast instead.
Iteration 0 [259.5 s]: HR = 0.4821, NDCG = 0.2605, loss = 0.4354 [20.2 s]
Iteration 1 [254.2 s]: HR = 0.4863, NDCG = 0.2656, loss = 0.3289 [21.0 s]


KeyboardInterrupt: 

In [14]:
best_hr, best_ndcg, best_iter

NameError: name 'best_hr' is not defined

In [12]:
# model.save('/Users/xun/Documents/Thesis/model/ks-3m-GMF.h5')
model = load_model('/Users/xun/Documents/Thesis/model/ks-3m-GMF.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [13]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 8)         206400      user_input[0][0]                 
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 8)         11888       item_input[0][0]                 
__________________________________________________________________________________________________
flatten_5 

In [16]:
(hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
print('HR = %.4f, NDCG = %.4f' 
% (hr, ndcg))

HR = 0.4898, NDCG = 0.2682


In [85]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x14d22edd8>,
 <keras.engine.input_layer.InputLayer at 0x14d22e400>,
 <keras.layers.embeddings.Embedding at 0x13b413cf8>,
 <keras.layers.embeddings.Embedding at 0x13b49a898>,
 <keras.layers.core.Flatten at 0x13b49ab00>,
 <keras.layers.core.Flatten at 0x13b49a828>,
 <keras.layers.merge.Multiply at 0x14d2363c8>,
 <keras.layers.core.Dense at 0x14d2364e0>]

In [86]:
model.get_weights()

[array([[ 0.5915518 ,  0.4114995 , -0.47339424, ..., -1.4496088 ,
          0.56494606,  1.1425885 ],
        [-0.81193525,  0.3437835 , -0.82701015, ..., -1.0681827 ,
          0.9869289 ,  1.628961  ],
        [-0.7600812 ,  0.4130647 , -0.9922285 , ..., -1.662772  ,
          0.41050148,  2.2316833 ],
        ...,
        [ 1.5950006 ,  1.3446106 , -1.0848769 , ..., -1.9834437 ,
          0.08011746,  1.7294837 ],
        [-0.9633407 ,  0.42373487, -0.5315732 , ..., -3.1593897 ,
          0.77586746,  0.87524784],
        [ 0.61812735,  0.8683222 , -1.1313384 , ..., -0.855194  ,
          0.25831667,  0.81883895]], dtype=float32),
 array([[ 0.04756297,  0.754279  ,  0.17979054, ..., -0.33292708,
          0.53909546, -1.6798451 ],
        [-0.12174495,  0.3850079 ,  0.544522  , ...,  0.37971583,
         -1.2514222 , -0.14402358],
        [-0.56476206,  0.50073075, -0.07830757, ..., -0.7146265 ,
         -2.1521273 ,  0.17786792],
        ...,
        [-0.59031785, -0.8555809 , -0.8

In [87]:
a = len(model.get_weights())
a

4

In [88]:
for i in range(4):
   print(model.get_weights()[i].shape)

(25800, 8)
(1486, 8)
(8, 1)
(1,)


In [None]:
for i in range(8):
    a = model.layers[i].get_weights()
    print(a)
# only embedding layer, 
# dense layer(1 is the multiply weight,1 is the bias weight) ) have weights

In [106]:
len(user_input) # 994169(# of 1.0) * 5 = 4970845
len(item_input) #~664977 three month interactions of positive elements 
len(labels)
len(set(user_input))
len(set(item_input))

3479790

3479790

3479790

In [90]:
train.shape
len(train.keys())# get the index of non-zero items
len(train.nonzero()[0])

(25800, 1486)

In [None]:
embeddings = model.get_weights()[0]
embeddings[0]

In [104]:
# for layer in model.layers: print(layer.name, layer.get_weights(),len(layer.get_weights()))

user_input [] 0
item_input [] 0
embedding_5 [array([[ 0.5915518 ,  0.4114995 , -0.47339424, ..., -1.4496088 ,
         0.56494606,  1.1425885 ],
       [-0.81193525,  0.3437835 , -0.82701015, ..., -1.0681827 ,
         0.9869289 ,  1.628961  ],
       [-0.7600812 ,  0.4130647 , -0.9922285 , ..., -1.662772  ,
         0.41050148,  2.2316833 ],
       ...,
       [ 1.5950006 ,  1.3446106 , -1.0848769 , ..., -1.9834437 ,
         0.08011746,  1.7294837 ],
       [-0.9633407 ,  0.42373487, -0.5315732 , ..., -3.1593897 ,
         0.77586746,  0.87524784],
       [ 0.61812735,  0.8683222 , -1.1313384 , ..., -0.855194  ,
         0.25831667,  0.81883895]], dtype=float32)] 1
embedding_6 [array([[ 0.04756297,  0.754279  ,  0.17979054, ..., -0.33292708,
         0.53909546, -1.6798451 ],
       [-0.12174495,  0.3850079 ,  0.544522  , ...,  0.37971583,
        -1.2514222 , -0.14402358],
       [-0.56476206,  0.50073075, -0.07830757, ..., -0.7146265 ,
        -2.1521273 ,  0.17786792],
       ...,

In [140]:
# user_input[0:1]
# item_input[0:1]
s = model.predict([[20312,20312, 20312, 20312], [925, 924, 923, 922]])
s_flatten = [item for sublist in s for item in sublist]
print(s_flatten)

[0.23075183, 0.004147677, 0.20985295, 0.0019679824]


In [152]:
s

array([[0.23075183],
       [0.00414768],
       [0.20985295],
       [0.00196798]], dtype=float32)

In [148]:
ids = np.argpartition(s_flatten, -2)[-count:]

In [153]:
ids

array([2, 0])

In [150]:
s_flatten[ids]

TypeError: only integer scalar arrays can be converted to a scalar index

In [154]:
s_array = np.asarray(s_flatten)
s_array[ids]

array([0.20985295, 0.23075183], dtype=float32)

In [161]:
list(zip(ids, s_array[ids]))

[(2, 0.20985295), (0, 0.23075183)]

In [155]:
best = sorted(zip(ids, s_array[ids]), key=lambda x: -x[1])
best

[(0, 0.23075183), (2, 0.20985295)]

In [162]:
x = (0, 0.23075183)
-x[1]

-0.23075183

In [159]:
y = [2,3,4,8]
sorted(y, key=lambda x: -x[1])

TypeError: 'int' object is not subscriptable

In [127]:
# list(zip(train.nonzero()[0], train.nonzero()[1]))
# train[9231].indices
# train.items()
# train.get((20312,9231), default=0.0)
# train_csr = train.tocsr()
train_csr[9231].indices


array([ 488, 1223, 1414,  431,  175,  560,  470,   82,  951, 1284,  994,
        129], dtype=int32)

In [185]:
def recommend(uid,item_list,train, model, N = 10, filter_already_liked_items=True):
    item_list.sort()
    train_csr = train.tocsr()
    num_items = len(item_list)
    user_input = [uid for i in range(num_items)]
    
    if filter_already_liked_items is True:
        liked = set(train_csr[uid].indices)
    else:
        liked = set()
    
    scores_l = model.predict([user_input, item_list])
    scores = np.asarray([item for sublist in scores_l for item in sublist])
    count = N + len(liked)
    if count < len(scores):
        ids = np.argpartition(scores, -count)[-count:]
        best = sorted(zip(ids, scores[ids]), key=lambda x: -x[1])
    else:
        best = sorted(enumerate(scores), key=lambda x: -x[1])
    return list(itertools.islice((rec for rec in best if rec[0] not in liked), N))
    

In [20]:
item_list = list(set(item_input))
recommended = recommend(20312,item_list,train, model, N= 10 )

NameError: name 'item_input' is not defined

In [19]:
cids = [x[0] for x in recommended]

NameError: name 'recommended' is not defined

In [18]:
cids

NameError: name 'cids' is not defined

In [17]:
[idx_to_cid[x] for x in cids]

NameError: name 'cids' is not defined

In [2]:
model = load_model('/Users/xun/Documents/Thesis/model/ks-3m-v2-rnn.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [4]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 8)             11888     
_________________________________________________________________
masking_1 (Masking)          (None, 30, 8)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 3)                 144       
_________________________________________________________________
dense_1 (Dense)              (None, 1486)              5944      
_________________________________________________________________
activation_1 (Activation)    (None, 1486)              0         
Total params: 17,976
Trainable params: 17,976
Non-trainable params: 0
_________________________________________________________________
