In [2]:
from datetime import datetime
import tensorflow as tf
import numpy as np
import pandas as pd
import surprise
from sklearn import model_selection

#### LFM：把用户再item上打分的行为，看作是有内部依据的，认为和k个factor有关系
#### 每一个user i会有一个用户的向量(k维)，每一个item会有一个item的向量(k维)

![](1.png)

In [3]:
np.random.seed(0)

In [4]:
# 下面以movielens为例
col_names = ["user", "item", "rate", "st"]
df = pd.read_csv('~/.surprise_data/ml-1m/ml-1m/ratings.dat', sep='::', 
                 engine='python', header=None, names=col_names, 
                 dtype={'user': np.int32, 'item': np.int32, 'st': np.int64, 'rate': np.float32})

# 用户id和item的id都减1, 因为要比最大的id小1
df['user'] = df['user'] - 1
df['item'] = df['item'] - 1
df.head()

Unnamed: 0,user,item,rate,st
0,0,1192,5.0,978300760
1,0,660,3.0,978302109
2,0,913,3.0,978301968
3,0,3407,4.0,978300275
4,0,2354,5.0,978824291


In [5]:
# 总共6040个用户 3706部电影
df.shape, len(df.user.unique()), len(df.item.unique())

((1000209, 4), 6040, 3706)

In [6]:
df = df.iloc[:, :-1]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 3 columns):
user    1000209 non-null int64
item    1000209 non-null int64
rate    1000209 non-null float32
dtypes: float32(1), int64(2)
memory usage: 19.1 MB


In [7]:
# 划分训练集和测试集 
df_train, df_test = model_selection.train_test_split(df, test_size=0.2)
# df_train, df_val = model_selection.train_test_split(df, test_size=0.2)

In [8]:
df_train.shape, df_test.shape

((800167, 3), (200042, 3))

In [9]:
df.item.max(), df.user.max()

(3951, 6039)

In [10]:
# tf.convert_to_tensor(df_train)
db_train = tf.data.Dataset.from_tensor_slices(df_train.values)
# db_val = tf.data.Dataset.from_tensor_slices(df_val.values)
db_test = tf.data.Dataset.from_tensor_slices(df_test.values)

batch_size = 512
db_train = db_train.shuffle(10000).batch(batch_size, drop_remainder=True)
# db_val = db_val.shuffle(10000).batch(batch_size,  drop_remainder=True)
db_test = db_test.batch(batch_size)

In [11]:
# 创建输出日志 用于可视化
current_time = datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = 'logs/' + current_time
summary_writer = tf.summary.create_file_writer(logdir=log_dir)

In [12]:
# 用户数
user_total_num = 6040
# 电影id最大的
item_total_num = 3952
# LFM的factor维度
DIM = 15
# 使用cpu做训练 如果你的电脑支持GPU的话,还是使用GPU来训练,运算效率上会有好几十倍的提升
DEVICE = "/cpu:0"

# 对照最上面的图片,初始化bais变量
# 全局的偏差变量
global_bias = tf.Variable([0.], name='global_bias', dtype=tf.float32)
# user的偏移变量
w_bias_user = tf.Variable([0.] * user_total_num, name='w_bias_user', dtype=tf.float32)
# item的偏移变量
w_bias_item = tf.Variable([0.] * item_total_num, name='w_bias_item', dtype=tf.float32)

w_user = tf.Variable(tf.random.truncated_normal([user_total_num, DIM], stddev=0.02), name="w_user", dtype=tf.float32)
w_item = tf.Variable(tf.random.truncated_normal([item_total_num, DIM], stddev=0.02), name="w_item", dtype=tf.float32)



In [39]:
# 正则项的惩罚系数
reg_labda = 0.1
optimizer = tf.optimizers.Adam(learning_rate=0.001)
n_epoch = 100
for epoch in range(n_epoch):
    # 开始遍历训练接 批次迭代
    for step, item in enumerate(db_train):
        with tf.GradientTape() as tap:
            # 找到这个batch中用户和item
            bias_user = tf.nn.embedding_lookup(w_bias_user, tf.cast(item[:, 0], dtype=tf.int32), name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, tf.cast(item[:, 1], dtype=tf.int32), name="bias_item")

            embd_user = tf.nn.embedding_lookup(w_user, tf.cast(item[:, 0], dtype=tf.int32), name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, tf.cast(item[:, 1], dtype=tf.int32), name="embedding_item")

            # 按照实际公式进行计算
            # 先对user向量和item向量求内积
            y_pred = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
            # 加上几个偏置项
            y_pred = tf.add(y_pred, global_bias)
            y_pred = tf.add(y_pred, bias_user)
            y_pred = tf.add(y_pred, bias_item, name="svd_inference")
            # 加上正则化项 
            regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name="svd_regularizer")

            # loss function 下面是损失函数 相当于 MSE + 正则项
            # l2_loss 输出值 output = sum(t ** 2) / 2

            cost_l2 = tf.nn.l2_loss(tf.subtract(y_pred, tf.cast(item[:, 2], dtype=tf.float32)))
            penalty = tf.constant(reg_labda, dtype=tf.float32, shape=[], name="l2")
            loss = tf.add(cost_l2, tf.multiply(regularizer, penalty))

        # 对参数球梯度 更新参数
        grads = tap.gradient(loss, [global_bias, w_bias_user, w_bias_item, w_user, w_item])
        optimizer.apply_gradients(zip(grads, [global_bias, w_bias_user, w_bias_item, w_user, w_item]))

        if step % 200 == 0:
            print(loss)
    
#     # 最后一次开始测试
#     if epoch == n_epoch - 1:
#         for x in db_test:
            


tf.Tensor(194.45468, shape=(), dtype=float32)
tf.Tensor(202.42969, shape=(), dtype=float32)
tf.Tensor(197.0939, shape=(), dtype=float32)
tf.Tensor(201.36182, shape=(), dtype=float32)
tf.Tensor(215.61707, shape=(), dtype=float32)
tf.Tensor(201.03453, shape=(), dtype=float32)
tf.Tensor(195.24013, shape=(), dtype=float32)
tf.Tensor(216.7022, shape=(), dtype=float32)
tf.Tensor(192.96458, shape=(), dtype=float32)
tf.Tensor(201.68907, shape=(), dtype=float32)
tf.Tensor(196.29425, shape=(), dtype=float32)
tf.Tensor(201.40315, shape=(), dtype=float32)
tf.Tensor(214.2475, shape=(), dtype=float32)
tf.Tensor(200.20401, shape=(), dtype=float32)
tf.Tensor(194.85258, shape=(), dtype=float32)
tf.Tensor(215.97609, shape=(), dtype=float32)
tf.Tensor(192.53879, shape=(), dtype=float32)
tf.Tensor(201.09076, shape=(), dtype=float32)
tf.Tensor(195.48354, shape=(), dtype=float32)
tf.Tensor(201.52338, shape=(), dtype=float32)
tf.Tensor(213.06607, shape=(), dtype=float32)
tf.Tensor(199.53792, shape=(), dtype=

In [23]:
for x in db_test.take(1):
   # 找到这个batch中用户和item
    bias_user = tf.nn.embedding_lookup(w_bias_user, tf.cast(item[:, 0], dtype=tf.int32), name="bias_user")
    bias_item = tf.nn.embedding_lookup(w_bias_item, tf.cast(item[:, 1], dtype=tf.int32), name="bias_item")

    embd_user = tf.nn.embedding_lookup(w_user, tf.cast(item[:, 0], dtype=tf.int32), name="embedding_user")
    embd_item = tf.nn.embedding_lookup(w_item, tf.cast(item[:, 1], dtype=tf.int32), name="embedding_item")

    # 按照实际公式进行计算
    # 先对user向量和item向量求内积
    y_pred = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
    # 加上几个偏置项
    y_pred = tf.add(y_pred, global_bias)
    y_pred = tf.add(y_pred, bias_user)
    y_pred = tf.add(y_pred, bias_item, name="svd_inference")

In [38]:
df_test = pd.DataFrame(x.numpy())
df_test['pred'] = y_pred
df_test.head(20)

Unnamed: 0,0,1,2,pred
0,1921.0,2093.0,4.0,2.583986
1,4917.0,2807.0,1.0,3.199754
2,956.0,1659.0,4.0,2.89344
3,4652.0,913.0,5.0,4.639668
4,3244.0,3323.0,1.0,3.926378
5,5090.0,587.0,5.0,1.984482
6,703.0,968.0,3.0,3.420414
7,3939.0,3782.0,4.0,4.269173
8,2242.0,909.0,5.0,3.856988
9,2418.0,3273.0,2.0,2.8545
