In [2]:
# 基于TensorFlow的隐语义模型推荐
# 数据：3900个电影；6040个用户

# Evaluate train times per epoch
import time
# Imports for data io operations
from collections import deque

import numpy as np
# Main imports for training
import tensorflow as tf

import tensorflows.readers as readers


In [3]:
# Constant seed for replicating training results
# 指定随机种子（按一种随机方式执行），方便进行场景复现
np.random.seed(42)

# Number of users in the dataset
u_num = 6040
# Number of movies in the dataset
i_num = 3952
# Number of samples per batch
# 迭代更新时的批次大小
batch_size = 1000
# Dimensions of the data, 15
# 隐含因子的维度：6040*5 * 5*3952 = 6040*3952
dims = 5
# Number of times the network sees all the training data
# 迭代次数
max_epochs = 50

# Device used for all computations
place_device = "/cpu:0"


In [4]:
df = readers.read_file("/Users/wugang/code/python/rec-demo/data/ml-1m/ratings.dat", sep="::")
rows = len(df)
df

Unnamed: 0,user,item,rate,st
0,0.0,1192.0,5.0,978300760
1,0.0,660.0,3.0,978302109
2,0.0,913.0,3.0,978301968
3,0.0,3407.0,4.0,978300275
4,0.0,2354.0,5.0,978824291
5,0.0,1196.0,3.0,978302268
6,0.0,1286.0,5.0,978302039
7,0.0,2803.0,5.0,978300719
8,0.0,593.0,4.0,978302268
9,0.0,918.0,4.0,978301368


In [5]:
def get_data():
    """
    Reads file using the demiliter :: form the ratings file
    Columns are user ID, item ID, rating, and timestamp
    Sample data - 3::1196::4::978297539
    :return: 
    """
    df = readers.read_file('/Users/wugang/code/python/rec-demo/data/ml-1m/ratings.dat', sep="::")
    rows = len(df)
    # 基于位置选择的纯整数位置索引
    # 防止一个batch中都是一个用户，对数据进行乱序，也就是重排序
    # numpy.random,permutation(x)是返回一个被洗牌过的array
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    return df_train, df_test

In [6]:
def clip(x):
    """
    使数组或列表x中的值分布在1～5之间。
    np.clip截取函数, 将范围外的数强制转化为范围内的数.
    def clip(a, a_min, a_max, out=None): 将数组a中的所有数限定到范围a_min和a_max中，
    即az中所有比a_min小的数都会强制变为a_min，a中所有比a_max大的数都会强制变为a_max.
    :param x: 
    :return: 
    """
    return np.clip(x, 1.0, 5.0)

In [7]:
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    """
    
    :param user_batch: 
    :param item_batch: 
    :param user_num: 
    :param item_num: 
    :param dim: 矩阵的维度；
    :param device: 训练配置，默认为cpu执行
    :return: 
    """
    with tf.device('/cpu:0'):
        # 指定命名域。防止在notebook中重复执行代码报错。
        # with tf.variable_scope('lsi', reuse=True):
            # Using a global bias term
            # 偏置项
            bias_global = tf.get_variable("bias_global", shape=[])
            # User and item bias variables
            # get_variable: Prefixes the name with the current variable scope 
            # and performs reuse checks.
            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
            # embedding_lookup: Looks up 'ids' in a list of embedding tensors
            # Bias embeddings for user and items, given a batch
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # User and item weight variables
            # 权重（N*dim * dim*M = N*M），并使用标准差进行初始化
            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            # Weight embeddings for user and items, given a batch
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
            
    with tf.device(device):
        # infer 最终结果值
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        # 正则化惩罚项矩阵
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), 
                             name="svd_regularizer")
    return infer, regularizer

In [8]:
def loss(infer, regularizer, rate_batch, learn_rate=0.01, reg=0.1, device="/cpu:0"):
    """
    
    :param infer: 预测最终结果值
    :param regularizer: 正则化惩罚项矩阵
    :param rate_batch: 真实值
    :param learn_rate: 梯度下降时的学习率
    :param reg: 正则化惩罚项，惩罚力度
    :param device: 运行环境配置
    :return: 
    """
    with tf.device(device):
        # Use L2 loss to compute penalty
        # 预测值和真实值之间的差异
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # 'Follow the Regularized Leader' optimizer
        # 梯度下降优化器
        train_op = tf.train.GradientDescentOptimizer(learning_rate=learn_rate).minimize(cost)
    return cost, train_op

In [10]:
# Read data from ratings file to build a TF model
df_train, df_test = get_data()

samples_per_batch = len(df_train) // batch_size
print(samples_per_batch)
print("Number of train samples %d, test samples %d, samples per batch %d" % 
      (len(df_train), len(df_test), samples_per_batch))

900
Number of train samples 900188, test samples 100021, samples per batch 900


In [11]:
# Peeking at the top 5 user values
print(df_train["user"].head()) 
print(df_test["user"].head())

0    1834.0
1    5836.0
2    1266.0
3    2468.0
4     117.0
Name: user, dtype: float32
0    5062.0
1     251.0
2    5831.0
3    2243.0
4    4903.0
Name: user, dtype: float32


In [12]:
# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())

0    1213.0
1     995.0
2     355.0
3    2040.0
4    2670.0
Name: item, dtype: float32
0    2917.0
1     291.0
2    2027.0
3    2310.0
4    1930.0
Name: item, dtype: float32


In [13]:
# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())

0    5.0
1    4.0
2    2.0
3    5.0
4    4.0
Name: rate, dtype: float32
0    5.0
1    4.0
2    4.0
3    3.0
4    5.0
Name: rate, dtype: float32


In [15]:
# Using a shuffle iterator to generate random batches, for training
# 迭代训练
# 利用ShuffleIterator取出指定的字段信息，并根据batch_size进行数据洗牌
iter_train = readers.ShuffleIterator([df_train["user"],
                                      df_train["item"],
                                      df_train["rate"]],
                                     batch_size=batch_size)

# Sequentially generate one-epoch batches, for testing
# 指定一个epoch进行迭代测试
iter_test = readers.OneEpochIterator([df_test["user"],
                                      df_test["item"],
                                      df_test["rate"]],
                                     batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learn_rate=0.0010, reg=0.05, device=place_device)


ValueError: Variable bias_global already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-7-bb5c25d0ae9c>", line 17, in model
    bias_global = tf.get_variable("bias_global", shape=[])
  File "<ipython-input-14-809d6edb3742>", line 20, in <module>
    infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
  File "/Users/wugang/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


In [75]:
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items, rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end

    saver.save(sess, './save/')

Epoch	Train Error	Val Error	Elapsed Time


InvalidArgumentError: You must feed a value for placeholder tensor 'id_user_4' with dtype int32 and shape [?]
	 [[Node: id_user_4 = Placeholder[dtype=DT_INT32, shape=[?], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'id_user_4', defined at:
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.init_path()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/asyncio/base_events.py", line 1425, in _run_once
    handle._run()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    self.saved_sigint_handler = signal(SIGINT, default_int_handler)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    except:
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    def start(self):
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-46-e6432ba5c6d3>", line 16, in <module>
    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1735, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 4925, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/Users/wugang/env/anaconda3/python.app/Contents/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'id_user_4' with dtype int32 and shape [?]
	 [[Node: id_user_4 = Placeholder[dtype=DT_INT32, shape=[?], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
