In [1]:
from surprise import KNNBasic, accuracy, Dataset
from surprise.model_selection import train_test_split

In [2]:
# Load the built-in "ml-100k" ratings dataset and hold out 20% of the ratings
# as a test set. The fixed random_state keeps the shuffled split reproducible.
data = Dataset.load_builtin("ml-100k")
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=10,
)

## KNNBasic
$N^k_i(u)$ 和 $N^k_u(i)$ 只包含相似度为正的邻居。
The prediction $\hat{r}_{ui}$ is set as:
$$
\hat{r}_{ui} = \frac{ \sum\limits_{v \in N^k_i(u)} \text{sim}(u, v) \cdot r_{vi}} {\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v)}
$$
or
$$
\hat{r}_{ui} = \frac{ \sum\limits_{j \in N^k_u(i)} \text{sim}(i, j) \cdot r_{uj}} {\sum\limits_{j \in N^k_u(i)} \text{sim}(i, j)}
$$

In [4]:
# Similarity configuration handed to the KNN estimator.
sim_options = dict(
    # Similarity measure, e.g. 'cosine', 'pearson_baseline' or
    # 'MSD' (mean squared difference).
    name="pearson_baseline",
    user_based=True,
    # Two users u, v who have co-rated fewer than min_support items get a
    # similarity of 0.
    min_support=20,
    # Shrinkage parameter to apply (only relevant for pearson_baseline
    # similarity). Default is 100.
    shrinkage=100,
)

model = KNNBasic(
    sim_options=sim_options,
    k=30,
    # Default is 1. Minimum number of neighbours to take into account: if
    # fewer than min_k neighbours (with positive similarity) are available,
    # the prediction falls back to the global mean.
    min_k=1,
    verbose=True,
)

# Kept as the last expression so the cell displays the fitted estimator.
model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20b81237630>

In [8]:
# User-user similarity matrix (symmetric): show its top-left 5x5 corner
# together with the shape of the full matrix.
model.sim[:5, :5], model.sim.shape

(array([[1.        , 0.00288911, 0.        , 0.        , 0.        ],
        [0.00288911, 1.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 1.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 1.        ]]),
 (943, 943))

In [9]:
# Inspect the similarity options currently stored on the model.
model.sim_options

{'name': 'pearson_baseline',
 'user_based': True,
 'min_support': 20,
 'shrinkage': 100}

In [10]:
# Inspect the baseline options stored on the model; none were passed to the
# KNNBasic constructor in this notebook.
model.bsl_options

{}

In [11]:
# Predict every rating in the held-out test set, then peek at the first ten
# Prediction records (true rating r_ui vs. estimate est).
pred = model.test(testset)
pred[:10]

[Prediction(uid='154', iid='302', r_ui=4.0, est=4.194023300579871, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid='896', iid='484', r_ui=4.0, est=4.145543093407065, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='230', iid='371', r_ui=4.0, est=3.1107402596055462, details={'actual_k': 24, 'was_impossible': False}),
 Prediction(uid='234', iid='294', r_ui=3.0, est=2.8370435671077905, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='25', iid='729', r_ui=4.0, est=3.7212408939437007, details={'actual_k': 28, 'was_impossible': False}),
 Prediction(uid='249', iid='156', r_ui=5.0, est=4.4089549076794965, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='64', iid='447', r_ui=4.0, est=3.5391499857991717, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid='896', iid='1134', r_ui=3.0, est=3.040980734834555, details={'actual_k': 11, 'was_impossible': False}),
 Prediction(uid='18', iid='153', r_ui=4.0, e

In [12]:
# Root-mean-square error of the predictions on the test set.
accuracy.rmse(pred)

RMSE: 1.0215


1.0215369179757938

把 `min_support` 调小试试（这里设为 0），观察 RMSE 的变化。

In [13]:
# Repeat the experiment with min_support lowered from 20 to 0, so that user
# pairs with few co-rated items keep their (shrunk) similarity instead of
# being forced to 0.
#
# A fresh estimator is built through the constructor rather than assigning to
# `sim_options` on the already-fitted model: every hyper-parameter of this run
# (same k=30 / min_k=1 as the first model) is then visible in this cell, and
# the cell does not depend on fit() re-reading a mutated attribute.
model = KNNBasic(
    k=30,
    min_k=1,
    sim_options={
        'name': 'pearson_baseline',
        'user_based': True,
        'min_support': 0,
        'shrinkage': 100,
    },
    verbose=True,
)
model.fit(trainset)

# Evaluate on the same held-out test set; rmse() prints the score and returns it.
new_pred = model.test(testset)
accuracy.rmse(new_pred)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9932


0.9932383104795434

In [14]:
# Same 5x5 corner of the user-user similarity matrix, now taken from the model
# fitted with min_support=0.
model.sim[:5, :5]

array([[ 1.        ,  0.00288911, -0.01398557,  0.01750217,  0.00828402],
       [ 0.00288911,  1.        ,  0.03474046,  0.00456765, -0.00237884],
       [-0.01398557,  0.03474046,  1.        ,  0.02321789, -0.02159672],
       [ 0.01750217,  0.00456765,  0.02321789,  1.        ,  0.03779227],
       [ 0.00828402, -0.00237884, -0.02159672,  0.03779227,  1.        ]])