In [1]:
from surprise import BaselineOnly
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split

In [2]:
# Load Surprise's built-in "ml-100k" ratings dataset.
data = Dataset.load_builtin("ml-100k")

# Shuffle the ratings and keep 80% of them for training; the fixed
# random_state makes the split repeatable from run to run.
trainset, testset = train_test_split(
    data,
    train_size=0.8,
    random_state=10,
    shuffle=True,
)

该 baseline 算法最小化以下带正则项的平方误差（使用 ALS 方法时，用户偏置和物品偏置的正则化系数分别由 `reg_u` 和 `reg_i` 指定）：
$$
\sum_{r_{ui}\in R_{train}}\left(r_{ui}-(\mu+b_u+b_i)\right)^2+\lambda\left(b_u^2+b_i^2\right).
$$

In [3]:
# Bias-estimation settings handed to BaselineOnly:
#   method   - optimizer used to fit the biases (alternating least squares)
#   n_epochs - number of ALS iterations
#   reg_u    - regularization strength for the user biases b_u
#   reg_i    - regularization strength for the item biases b_i
bsl_options = {
    'method': 'als',
    'n_epochs': 10,
    'reg_u': 12,
    'reg_i': 5,
}

# verbose=True makes fit() report which bias-estimation method is running.
baseline = BaselineOnly(bsl_options=bsl_options, verbose=True)

# Kept as the cell's final expression, so the estimator returned by fit()
# is echoed below the cell.
baseline.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x29786dcf8d0>

In [4]:
# Predict a rating for every (user, item, true rating) triple in the
# held-out set; verbose=False (the default) keeps per-prediction output off.
pred = baseline.test(testset, verbose=False)

# Peek at the first ten Prediction records (raw ids, true rating, estimate).
pred[:10]

[Prediction(uid='154', iid='302', r_ui=4.0, est=4.294213006413121, details={'was_impossible': False}),
 Prediction(uid='896', iid='484', r_ui=4.0, est=3.630032476033948, details={'was_impossible': False}),
 Prediction(uid='230', iid='371', r_ui=4.0, est=3.458257925450486, details={'was_impossible': False}),
 Prediction(uid='234', iid='294', r_ui=3.0, est=2.5622782715012966, details={'was_impossible': False}),
 Prediction(uid='25', iid='729', r_ui=4.0, est=3.5903947613626372, details={'was_impossible': False}),
 Prediction(uid='249', iid='156', r_ui=5.0, est=4.3494386318087415, details={'was_impossible': False}),
 Prediction(uid='64', iid='447', r_ui=4.0, est=3.3699065522994838, details={'was_impossible': False}),
 Prediction(uid='896', iid='1134', r_ui=3.0, est=2.7709378054991585, details={'was_impossible': False}),
 Prediction(uid='18', iid='153', r_ui=4.0, est=3.8832826951221637, details={'was_impossible': False}),
 Prediction(uid='378', iid='709', r_ui=4.0, est=3.9746148599463846, details={'was_impossible': False})]

In [5]:
# First ten raw (user id, item id, rating) triples of the test set; they appear
# in the same order as the first ten predictions, which allows a side-by-side
# comparison of true ratings and estimates.
testset[:10]

[('154', '302', 4.0),
 ('896', '484', 4.0),
 ('230', '371', 4.0),
 ('234', '294', 3.0),
 ('25', '729', 4.0),
 ('249', '156', 5.0),
 ('64', '447', 4.0),
 ('896', '1134', 3.0),
 ('18', '153', 4.0),
 ('378', '709', 4.0)]

In [8]:
# Number of fitted item biases b_i and the first ten values.
# Positions in this array are the trainset's inner item ids, not raw ids.
baseline.bi.shape[0], baseline.bi[:10]

(1653, array([ 0.89635215, -0.27567096,  0.25453814,  0.26963269, -0.8494579 ,
         0.21856154,  0.60372569,  0.59540045,  0.02074645,  0.33199024]))

In [9]:
# Number of fitted user biases b_u and the first ten values.
# Positions in this array are the trainset's inner user ids, not raw ids.
baseline.bu.shape[0], baseline.bu[:10]

(943, array([-0.57340316, -0.09967058,  0.05739471, -0.22851873,  0.01401689,
        -0.2978673 ,  0.17696704, -0.02960559, -0.65771845, -0.18264165]))

In [12]:
# Mean of all ratings in the training split; this is the mu term that the
# manual prediction cells below add to the user and item biases.
global_mean = trainset.global_mean
global_mean

3.5282375

In [13]:
# CAUTION: 154 and 302 are the *raw* ids of the first test rating, used here
# directly as array positions. bu/bi are positioned by the trainset's inner
# ids, so this sum does NOT reproduce that prediction's `est`; the raw ids
# must first be mapped with trainset.to_inner_uid / trainset.to_inner_iid.
global_mean + baseline.bu[154] + baseline.bi[302]

2.935450483568054

In [14]:
# Translate the raw ids of the first test rating into the trainset's inner ids.
# Note the tuple order: (inner item id, inner user id).
trainset.to_inner_iid('302'), trainset.to_inner_uid('154')

(171, 738)

In [17]:
# Rebuild the first test prediction by hand: mu + b_u + b_i.
# bu/bi are positioned by inner ids, so the raw ids are mapped through the
# trainset here instead of pasting in numbers printed by an earlier cell;
# hard-coded positions silently go stale whenever the split or its seed changes.
inner_uid = trainset.to_inner_uid('154')
inner_iid = trainset.to_inner_iid('302')
global_mean + baseline.bu[inner_uid] + baseline.bi[inner_iid]

4.294213006413121

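上述结果与 `Prediction(uid='154', iid='302', r_ui=4.0, est=4.294213006413121, details={'was_impossible': False})` 中的 `est` 一致。注意 `baseline.bu`、`baseline.bi` 是按训练集的内部 id（inner id）索引的，因此必须先用 `trainset.to_inner_uid` / `trainset.to_inner_iid` 把原始 id 转换为内部 id；直接用原始 id 作下标会得到错误的结果。可见该算法的原理是：先计算训练集评分的全局均值 $\mu$，然后为每个用户和物品设置初始偏置项 $b_u, b_i$，再用 `bsl_options` 中指定的优化算法求解 $b_u, b_i$，最终预测值为 $\hat{r}_{ui}=\mu+b_u+b_i$。损失函数如下：
$$
\sum_{r_{ui}\in R_{train}}\left(r_{ui}-(\mu+b_u+b_i)\right)^2+\lambda\left(b_u^2+b_i^2\right).
$$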

In [18]:
# Root-mean-squared error of the held-out predictions.
# verbose=True (the default) prints the formatted score; the returned float is
# echoed as well because the call is the cell's final expression.
accuracy.rmse(pred, verbose=True)

RMSE: 0.9339


0.9338705612178355