In [1]:
from surprise import KNNWithZScore, accuracy, Dataset
from surprise.model_selection import train_test_split

In [2]:
# Load the built-in MovieLens 100k ratings and hold out 20% of them for evaluation.
# The fixed random_state keeps the shuffled split reproducible between runs.
data = Dataset.load_builtin("ml-100k")
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=10,
)

## KNNWithZScore
一种基本的协同过滤算法，对每个用户（或物品）的评分做 z-score 归一化，即同时考虑其平均评分和评分标准差。  
The prediction $\hat{r}_{ui}$ is set as:
$$
\hat{r}_{ui}=\mu_u+\sigma_u\frac{\sum_{v\in N^k_i(u)}\text{sim}(u,v)\cdot(r_{vi}-\mu_v)/\sigma_v}{\sum_{v\in N^k_i(u)}\text{sim}(u,v)}
$$
or, when `user_based` is `False` in `sim_options` (item-based neighbourhoods):  
$$
\hat{r}_{ui}=\mu_i+\sigma_i\frac{\sum_{j\in N^k_u(i)}\text{sim}(i,j)\cdot(r_{uj}-\mu_j)/\sigma_j}{\sum_{j\in N^k_u(i)}\text{sim}(i,j)}
$$

In [3]:
# Similarity settings handed to the KNN model:
#   name        - similarity measure to compute ("pearson_baseline")
#   min_support - minimum number of common ratings required for a non-zero similarity
#   user_based  - True => similarities are computed between users rather than items
#   shrinkage   - shrinkage parameter used by the pearson_baseline measure
sim_options = dict(
    name="pearson_baseline",
    min_support=0,
    user_based=True,
    shrinkage=100,
)

# Use at most k=30 neighbours per prediction and require at least min_k=1;
# verbose=True prints progress while the similarity matrix is being built.
model = KNNWithZScore(k=30, min_k=1, sim_options=sim_options, verbose=True)

# fit() is left as the last expression so the fitted estimator is displayed.
model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x22b5294e390>

In [4]:
# User-user similarity matrix (symmetric): preview its top-left 5x5 corner
# together with the full matrix shape.
user_sim = model.sim
user_sim[:5, :5], user_sim.shape

(array([[ 1.        ,  0.00288911, -0.01398557,  0.01750217,  0.00828402],
        [ 0.00288911,  1.        ,  0.03474046,  0.00456765, -0.00237884],
        [-0.01398557,  0.03474046,  1.        ,  0.02321789, -0.02159672],
        [ 0.01750217,  0.00456765,  0.02321789,  1.        ,  0.03779227],
        [ 0.00828402, -0.00237884, -0.02159672,  0.03779227,  1.        ]]),
 (943, 943))

In [5]:
# Estimate a rating for every (user, item) pair held out in the test set,
# then preview the first ten Prediction records.
pred = model.test(testset, verbose=False)
pred[:10]

[Prediction(uid='154', iid='302', r_ui=4.0, est=4.4379295008668125, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='896', iid='484', r_ui=4.0, est=3.453858570329012, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='230', iid='371', r_ui=4.0, est=3.5744438733171213, details={'actual_k': 27, 'was_impossible': False}),
 Prediction(uid='234', iid='294', r_ui=3.0, est=2.4590286813499747, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='25', iid='729', r_ui=4.0, est=4.111132011921498, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='249', iid='156', r_ui=5.0, est=4.813341348943073, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='64', iid='447', r_ui=4.0, est=3.5957262363407274, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='896', iid='1134', r_ui=3.0, est=2.4737336782154853, details={'actual_k': 13, 'was_impossible': False}),
 Prediction(uid='18', iid='153', r_ui=4.0, e

In [6]:
# Root-mean-square error over the test-set predictions; rmse() prints the
# score and also returns it, so the value is displayed as the cell result.
accuracy.rmse(pred, verbose=True)

RMSE: 0.9264


0.926407579942053