In [1]:
import heapq

from surprise import KNNBaseline, accuracy, Dataset
from surprise.model_selection import train_test_split

In [2]:
# Load the built-in MovieLens-100k ratings and hold out 20% of them for
# evaluation. The fixed random_state keeps the split reproducible.
data = Dataset.load_builtin("ml-100k")
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=10,
    shuffle=True,
)

## KNNBaseline
一种基本协同过滤算法，考虑到基线评分。  
The prediction $\hat{r}_{ui}$ is set as (user-based, `user_based=True`):
$$
\hat{r}_{ui}=b_{ui}+\frac{\sum_{v\in N^k_i(u)}\text{sim}(u,v)\cdot(r_{vi}-b_{vi})}{\sum_{v\in N^k_i(u)}\text{sim}(u,v)}
$$
or (item-based, `user_based=False`):
$$
\hat{r}_{ui}=b_{ui}+\frac{\sum_{j\in N^k_u(i)}\text{sim}(i,j)\cdot(r_{uj}-b_{uj})}{\sum_{j\in N^k_u(i)}\text{sim}(i,j)}
$$

In [3]:
# Similarity configuration: Pearson-baseline similarity computed between
# users (user_based=True), with a shrinkage of 100 and no minimum support.
sim_options = {
    "name": "pearson_baseline",
    "min_support": 0,
    "user_based": True,
    "shrinkage": 100,
}

# Baseline configuration: biases are estimated with ALS for 10 epochs,
# using separate regularisation strengths for users and items.
bsl_options = {
    "method": "als",
    "n_epochs": 10,
    "reg_u": 12,
    "reg_i": 5,
}

# Use at most 30 neighbours per prediction and require at least 1.
model = KNNBaseline(k=30, min_k=1, sim_options=sim_options,
                    bsl_options=bsl_options, verbose=True)

# fit() is the cell's last expression, so the fitted model's repr is displayed.
model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x222da0a7978>

In [4]:
# User-user similarity matrix computed during fit(); it is symmetric.
sim = model.sim
# Show the top-left 5x5 corner together with the full matrix shape.
sim[:5, :5], sim.shape

(array([[ 1.        ,  0.00270557, -0.01482526,  0.01699328,  0.00849375],
        [ 0.00270557,  1.        ,  0.03473093,  0.00504646, -0.00235382],
        [-0.01482526,  0.03473093,  1.        ,  0.02100351, -0.02198502],
        [ 0.01699328,  0.00504646,  0.02100351,  1.        ,  0.03860745],
        [ 0.00849375, -0.00235382, -0.02198502,  0.03860745,  1.        ]]),
 (943, 943))

In [5]:
# Predict a rating for every (user, item) pair in the held-out test set.
pred = model.test(testset)
# Preview the first ten Prediction tuples.
pred[:10]

[Prediction(uid='154', iid='302', r_ui=4.0, est=4.3011830747708295, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='896', iid='484', r_ui=4.0, est=3.561435096905012, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='230', iid='371', r_ui=4.0, est=3.471466749152098, details={'actual_k': 28, 'was_impossible': False}),
 Prediction(uid='234', iid='294', r_ui=3.0, est=2.345989406310192, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='25', iid='729', r_ui=4.0, est=3.8041289217402543, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='249', iid='156', r_ui=5.0, est=4.828685131096553, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='64', iid='447', r_ui=4.0, est=3.4591623955639337, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='896', iid='1134', r_ui=3.0, est=2.5112712801695345, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid='18', iid='153', r_ui=4.0, es

In [6]:
# Root-mean-squared error over the test predictions; rmse() prints the value
# and also returns it, which is why it appears twice in the output.
accuracy.rmse(pred)

RMSE: 0.9132


0.9131574961896006

In [7]:
# Item biases b_i estimated by the baseline step, indexed by inner item id.
bi = model.bi
bi[:10]

array([ 0.89635215, -0.27567096,  0.25453814,  0.26963269, -0.8494579 ,
        0.21856154,  0.60372569,  0.59540045,  0.02074645,  0.33199024])

In [8]:
# User biases b_u estimated by the baseline step, indexed by inner user id.
bu = model.bu
bu[:10]

array([-0.57340316, -0.09967058,  0.05739471, -0.22851873,  0.01401689,
       -0.2978673 ,  0.17696704, -0.02960559, -0.65771845, -0.18264165])

In [9]:
model.bx[:10]  # same values as bu (the model was configured with user_based=True)

array([-0.57340316, -0.09967058,  0.05739471, -0.22851873,  0.01401689,
       -0.2978673 ,  0.17696704, -0.02960559, -0.65771845, -0.18264165])

In [10]:
# Translate the raw ids of the first test prediction (uid='154', iid='302')
# into the trainset's inner item id and inner user id.
trainset.to_inner_iid('302'), trainset.to_inner_uid('154')

(171, 738)

In [11]:
# Reproduce by hand the neighbour selection for the first test prediction
# (raw uid='154', raw iid='302').
# The inner ids are looked up instead of hard-coded so the cell keeps working
# if the train/test split changes.
inner_uid = trainset.to_inner_uid('154')  # 738 for the split used here
inner_iid = trainset.to_inner_iid('302')  # 171 for the split used here

# trainset.ir[item] holds (inner user id, rating) pairs for that item.
ir = trainset.ir[inner_iid]

# Pair every rater of the item with their similarity to the target user.
# Similarities are read from model.sim directly, so this cell can be re-run
# even if the notebook-level name `sim` has been rebound in the meantime.
neighbors = [(u, model.sim[inner_uid, u], r) for (u, r) in ir]

# Keep the k most similar raters (k is the value the model was built with).
k_neighbors = heapq.nlargest(model.k, neighbors, key=lambda t: t[1])

In [12]:
# Mean of all training ratings (the mu term of the baseline estimate).
global_mean = trainset.global_mean

In [13]:
# Accumulate the similarity-weighted baseline residuals of the selected
# neighbours: the numerator (sum_ratings) and denominator (sum_sim) of the
# KNNBaseline correction term, plus the number of neighbours actually used.
#
# The per-neighbour similarity is named `nb_sim` so that the loop does not
# rebind the notebook-level similarity matrix `sim` to a float, which would
# break any earlier cell that indexes `sim` when it is re-run.
inner_iid = trainset.to_inner_iid('302')  # 171 for the split used here

sum_sim = sum_ratings = actual_k = 0
for nb, nb_sim, r in k_neighbors:
    # Only positively correlated neighbours contribute.
    if nb_sim > 0:
        sum_sim += nb_sim
        # Baseline estimate for the neighbour's rating of the same item.
        nb_bsl = global_mean + bu[nb] + bi[inner_iid]
        sum_ratings += nb_sim * (r - nb_bsl)
        actual_k += 1
sum_ratings, sum_sim, actual_k

(0.010150830671586102, 1.4563459281371463, 30)

In [15]:
# Final estimate: the neighbourhood correction plus the baseline
# (user bias + item bias + global mean) for inner user 738 / inner item 171.
# The additions keep their original left-to-right order so the floating-point
# result is bit-identical to the value shown below.
neighbor_term = sum_ratings / sum_sim
est = neighbor_term + bu[738] + bi[171] + global_mean
est

4.3011830747708295

计算结果与模型pred输出一致
$$
\hat{r}_{ui}=\mu+b_u+b_i+\frac{\sum_{v\in N^k_i(u)}\text{sim}(u,v)\cdot(r_{vi}-\mu-b_v-b_i)}{\sum_{v\in N^k_i(u)}\text{sim}(u,v)}
$$
or (item-based, `user_based=False`):
$$
\hat{r}_{ui}=\mu+b_u+b_i+\frac{\sum_{j\in N^k_u(i)}\text{sim}(i,j)\cdot(r_{uj}-\mu-b_u-b_j)}{\sum_{j\in N^k_u(i)}\text{sim}(i,j)}
$$