In [1]:
import tensorflow as tf
tf.__version__

  from ._conv import register_converters as _register_converters


'1.12.0'

In [2]:
import numpy as np
import os
import random
from collections import defaultdict

In [3]:
def load_data(path:str):
    user_ratings = defaultdict(set)
    max_u_id = -1
    max_i_id = -1
    with open(path,'r') as f:
        for line in f.readlines():
            u,i,_,_ = line.split("\t")
            u = int(u)
            i = int(i)
            user_ratings[u].add(i)
            max_u_id = max(u,max_u_id)
            max_i_id = max(i,max_i_id)
    print("max_u_id:",max_u_id)
    print("max_i_idL",max_i_id)
    return max_u_id,max_i_id,user_ratings

In [4]:
def generate_test(user_ratings):
    """
    对每一个用户u，在user_ratings中随机找到他评分过的一部电影i,保存在user_ratings_test，
    后面构造训练集和测试集需要用到。
    """
    user_test = dict()
    for u,i_list in user_ratings.items():
        user_test[u] = random.sample(user_ratings[u],1)[0]  # [0]  用来取出元素
    return user_test

In [5]:
def generate_train_batch(user_ratings,user_ratings_test,item_count,batch_size=512):
    """
    构造训练用的三元组
    对于随机抽出的用户u，i可以从user_ratings随机抽出，而j也是从总的电影集中随机抽出，当然j必须保证(u,j)不在user_ratings中
    """
    t = []
    for b in range(batch_size):
        u = random.sample(user_ratings.keys(),1)[0]   #  抽出 user 
        i = random.sample(user_ratings[u],1)[0]       #  抽出 u-i 
        while i==user_ratings_test[u]:                #  保证 i 不在 user_ratings 中
            i = random.sample(user_ratings[u],1)[0]

        j = random.randint(1,item_count)   
        while j in user_ratings[u]:                 #  保证 j 不在 user_ratings 中
            j = random.randint(1,item_count)

        t.append([u,i,j])

    return np.asarray(t)

In [6]:
def generate_test_batch(user_ratings,user_ratings_test,item_count):
    """
    对于每个用户u，它的评分电影i是我们在user_ratings_test中随机抽取的，它的j是用户u所有没有评分过的电影集合，
    比如用户u有1000部电影没有评分，那么这里该用户的测试集样本就有1000个
    """
    for u in user_ratings.keys():
        t = []                                
        i = user_ratings_test[u]
        for j in range(1,item_count + 1):
            if not(j in user_ratings[u]):   #  找出 j  用户没有评过分的
                t.append([u,i,j])          
        yield np.asarray(t)


In [7]:
def bpr_mf(user_count,item_count,hidden_dim):
    u = tf.placeholder(tf.int32,[None])
    i = tf.placeholder(tf.int32,[None])
    j = tf.placeholder(tf.int32,[None])

    user_emb_w = tf.get_variable("user_emb_w", [user_count + 1, hidden_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))
    item_emb_w = tf.get_variable("item_emb_w", [item_count + 1, hidden_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))

    u_emb = tf.nn.embedding_lookup(user_emb_w, u)
    i_emb = tf.nn.embedding_lookup(item_emb_w, i)
    j_emb = tf.nn.embedding_lookup(item_emb_w, j)

    # MF predict: u_i > u_j
    x = tf.reduce_sum(tf.multiply(u_emb,(i_emb-j_emb)),1,keep_dims=True)

    # average AUC = mean( auc for each user in test set)
    mf_auc = tf.reduce_mean(tf.to_float(x>0))

    l2_norm = tf.add_n([
        tf.reduce_sum(tf.multiply(u_emb, u_emb)),
        tf.reduce_sum(tf.multiply(i_emb, i_emb)),
        tf.reduce_sum(tf.multiply(j_emb, j_emb))
    ])

    regulation_rate = 0.0001
    bprloss = regulation_rate * l2_norm - tf.reduce_mean(tf.log(tf.sigmoid(x)))

    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(bprloss)
    return u, i, j, mf_auc, bprloss, train_op

In [8]:
user_count,item_count,user_ratings = load_data('../dataset/ml-100k/u.data')
user_ratings_test = generate_test(user_ratings)

with tf.Session() as sess:
    u,i,j,mf_auc,bprloss,train_op = bpr_mf(user_count,item_count,20)
    sess.run(tf.global_variables_initializer())   # 初始化变量

    for epoch in range(1,4):
        _batch_bprloss = 0
        
        # train
        for k in range(1,5000):  # batch size = 5000,统一从train set 中进行采样
            uij = generate_train_batch(user_ratings,user_ratings_test,item_count)
            _bprloss,_train_op = sess.run([bprloss,train_op],feed_dict={u:uij[:,0],i:uij[:,1],j:uij[:,2]})

            _batch_bprloss += _bprloss

        print("epoch:",epoch)
        print("bpr_loss:",_batch_bprloss / k)
        print("_train_op")

        user_count = 0
        _auc_sum = 0.0

        # test
        for t_uij in generate_test_batch(user_ratings, user_ratings_test, item_count):
            _auc, _test_bprloss = sess.run([mf_auc, bprloss],feed_dict={u: t_uij[:, 0], i: t_uij[:, 1], j: t_uij[:, 2]})
            user_count += 1
            _auc_sum += _auc
        print("test_loss: ", _test_bprloss, "test_auc: ", _auc_sum / user_count)
        print("")
        
    variable_names = [v.name for v in tf.trainable_variables()]
    values = sess.run(variable_names)
    for k, v in zip(variable_names, values):
        print("Variable: ", k)
        print("Shape: ", v.shape)
       # print(v)

max_u_id: 943
max_i_idL 1682
Instructions for updating:
keep_dims is deprecated, use keepdims instead
epoch: 1
bpr_loss: 0.7243259270493091
_train_op
test_loss:  0.81097615 test_auc:  0.4995024060780599

epoch: 2
bpr_loss: 0.7236929123199899
_train_op
test_loss:  0.80606645 test_auc:  0.49969343140139616

epoch: 3
bpr_loss: 0.7230546190920389
_train_op
test_loss:  0.8015145 test_auc:  0.49979466989351

Variable:  user_emb_w:0
Shape:  (944, 20)
[[-1.1495791e-01 -1.5841755e-04 -1.6849218e-01 ...  3.0700281e-02
  -8.3424732e-02  6.5318316e-02]
 [-4.0012114e-02 -6.0872216e-04 -2.1939857e-02 ... -3.9159764e-02
   1.8236209e-02  1.4607744e-01]
 [-1.5655212e-01  3.7949439e-02  8.3874315e-02 ...  1.3156262e-01
   8.1651181e-02  8.2156524e-02]
 ...
 [-4.6532106e-02  1.3981578e-01 -1.1642193e-01 ...  4.6231006e-03
  -4.2007055e-02 -9.1899961e-02]
 [ 3.5480317e-02  6.9339819e-02  9.2969844e-05 ... -1.0023547e-01
  -1.5890995e-01 -4.0201873e-02]
 [-4.9461238e-02 -1.2933241e-01  4.0046796e-02 ...  

In [12]:
values[0][0]  # 第一个user的  user_emb_w

array([-1.1495791e-01, -1.5841755e-04, -1.6849218e-01, -1.7514817e-01,
        1.3888405e-01, -5.2612860e-02, -1.4011137e-01,  4.8302446e-02,
        8.5238643e-02,  5.2321006e-02, -3.7560042e-02,  2.3625390e-01,
        1.5694846e-01,  1.2698776e-01, -4.0184390e-03, -8.5913189e-02,
       -1.2684788e-01,  3.0700281e-02, -8.3424732e-02,  6.5318316e-02],
      dtype=float32)

In [15]:
np.array(values[0][0]).shape

(20,)

In [18]:
np.array(values[1]).shape

(1683, 20)

In [19]:
p

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [35]:

#  0号用户对这个用户对所有电影的预测评分
session1 = tf.Session()
u1_dim = tf.expand_dims(values[0][0], 0)  # 变为 (1,20)
u1_all = tf.matmul(u1_dim, values[1],transpose_b=True)  # (1,20) X (20,1683) = (1,1683)
result_1 = session1.run(u1_all)  # 第一个user 对所有用户的评分
print (result_1)

print("以下是给用户0的推荐：")
p = np.squeeze(result_1)  # 从数组的形状中删除单维条目，即把shape中为1的维度去掉
p[np.argsort(p)[:-5]] = 0   # 如果存后面取的话，就拿不到index了
for index in range(len(p)):
    if p[index] != 0:
        print (index, p[index])

[[ 0.04271021  0.07925439  0.04533527 ... -0.01799173 -0.06918238
  -0.0059652 ]]
以下是给用户0的推荐：
76 0.17093994
248 0.13948044
989 0.14081037
1493 0.14552039
1662 0.16801311
