In [2]:
import pandas as pd

ratings = pd.read_csv("../datasets/ml-25m/ratings.csv")
movies = pd.read_csv(
    "../datasets/ml-25m/movies.csv",
)
df = pd.merge(ratings, movies[["movieId", "title"]], on="movieId")


In [3]:
from scipy.sparse import csr_matrix

# 编码（避免巨大稀疏索引）
user_codes = df["userId"].astype("category").cat.codes
movie_codes = df["movieId"].astype("category").cat.codes

ratings_sparse = csr_matrix((df["rating"], (user_codes, movie_codes)))

print(ratings_sparse.shape)
print(ratings_sparse.nnz)  # 非零元素数量


(162541, 59047)
25000095


In [4]:
def splite_data(ratings_sparse, test_size=0.2):
    from sklearn.model_selection import train_test_split

    train_data, test_data = train_test_split(
        range(ratings_sparse.shape[0]),
        test_size=test_size,
        random_state=42,
    )
    return train_data, test_data

train_idx, test_idx = splite_data(ratings_sparse,test_size=0.01)
train_data = ratings_sparse[train_idx]
test_data = ratings_sparse[test_idx]
print(train_data.shape, test_data.shape)
print(train_data.nnz, test_data.nnz)

(160915, 59047) (1626, 59047)
24749602 250493


In [5]:
import numpy as np

val_data = test_data.copy()
def mask_test_data(test_data, frac=0.2):
    data = test_data.data
    indices = test_data.indices
    indptr = test_data.indptr
    shape = test_data.shape
    
    mask = np.ones_like(data, dtype=bool)
    for i in range(test_data.shape[0]):
        start = indptr[i]
        end = indptr[i + 1]
        n_ratings = end - start
        n_mask = max(1, int(n_ratings * frac))
        if n_ratings > 0:
            mask_indices = np.random.choice(
                np.arange(start, end), size=n_mask, replace=False
            )
            mask[mask_indices] = False
    data_masked = data[mask]
    indices_masked = indices[mask]
    indptr_masked = np.zeros(shape[0] + 1, dtype=indptr.dtype)
    retained_positions = np.where(mask)[0]
    row_assignments = np.searchsorted(indptr, retained_positions, side="right") - 1
    row_counts = np.bincount(row_assignments, minlength=shape[0])
    cumulative_sum = np.cumsum(row_counts)
    indptr_masked[1:] = cumulative_sum
    return csr_matrix((data_masked, indices_masked, indptr_masked), shape=test_data.shape)

test_data = mask_test_data(test_data, frac=0.2)
print(test_data.shape, val_data.shape)
print(test_data.nnz, val_data.nnz)

(1626, 59047) (1626, 59047)
201012 250493


In [30]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import hnswlib


def hnswlib_topk_svd(train_sparse,test_sparse, k=50, dim=128):
    svd = TruncatedSVD(n_components=dim)
    train = svd.fit_transform(train_sparse).astype("float32")

    train_norm = normalize(train, axis=1).astype("float32")
    n_users_train, dim = train_norm.shape
    n_users_test = test_sparse.shape[0]
    test = svd.transform(test_sparse).astype("float32")
    test_norm = normalize(test, axis=1).astype("float32")

    index = hnswlib.Index(space="cosine", dim=dim)
    index.init_index(max_elements=n_users_train, ef_construction=200, M=32)
    index.add_items(train_norm)
    index.set_ef(128)

    top_ids, top_dists = index.knn_query(test_norm, k + 1)

    topk = {}
    for u in range(n_users_test):
        ids = top_ids[u]
        dists = top_dists[u]
        topk[u] = list(zip(ids[:k].tolist(), (1 - dists[:k]).tolist()))
    return topk


topk_dict = hnswlib_topk_svd(train_data, test_data, k=50)
print(topk_dict[0])


[(74702, 0.8115955591201782), (82317, 0.8071398735046387), (57888, 0.7959361672401428), (29629, 0.7842761278152466), (99576, 0.7763625979423523), (57636, 0.7716629505157471), (112559, 0.7714238166809082), (9605, 0.7642025351524353), (101887, 0.7566736340522766), (64357, 0.7540217638015747), (81077, 0.7518483400344849), (91539, 0.7490201592445374), (31418, 0.743718683719635), (21975, 0.7339298725128174), (20690, 0.7337057590484619), (95734, 0.7261963486671448), (82974, 0.7246642112731934), (30153, 0.7174654603004456), (135650, 0.715464174747467), (22494, 0.7121829986572266), (114861, 0.7100189924240112), (86831, 0.7018918395042419), (57081, 0.6998987197875977), (93155, 0.6988957524299622), (40605, 0.6986770033836365), (5784, 0.6978859901428223), (36425, 0.6964825391769409), (94396, 0.6920182108879089), (98849, 0.6873684525489807), (141180, 0.6872909069061279), (147467, 0.6863797903060913), (86858, 0.6847625374794006), (141787, 0.6842553615570068), (4601, 0.6837467551231384), (35430, 0.6

In [7]:
def rmse(topk_dict, val_data):
    from sklearn.metrics import root_mean_squared_error

    y_true = []
    y_pred = []

    # 将train_data转换为易于访问的形式
    train_ratings = {}
    for u in range(train_data.shape[0]):
        start = train_data.indptr[u]
        end = train_data.indptr[u + 1]
        for i in range(start, end):
            movie = train_data.indices[i]
            rating = train_data.data[i]
            train_ratings[(u, movie)] = rating

    # 对验证集中的每个用户进行预测
    for u in range(val_data.shape[0]):
        start = val_data.indptr[u]
        end = val_data.indptr[u + 1]
        movie_indices = val_data.indices[start:end]
        true_ratings = val_data.data[start:end]

        if len(true_ratings) == 0:
            continue

        # 获取用户的最近邻及相似度
        neighbors = topk_dict.get(u, [])

        # 对用户观看的每部电影进行预测
        for movie, true_rating in zip(movie_indices, true_ratings):
            numerator = 0.0  # 分子：加权评分总和
            denominator = 0.0  # 分母：相似度总和

            # 基于邻居的评分进行加权平均
            for neighbor_id, similarity in neighbors:
                neighbor_rating = train_ratings.get((neighbor_id, movie), None)
                if neighbor_rating is not None:
                    numerator += similarity * neighbor_rating
                    denominator += abs(similarity)

            # 预测评分
            if denominator > 0:
                predicted_rating = numerator / denominator
                # 确保评分在合理范围内（如1-5）
                predicted_rating = max(1.0, min(5.0, predicted_rating))
            else:
                predicted_rating = 0  # 或使用全局平均分

            y_true.append(true_rating)
            y_pred.append(predicted_rating)

    rmse_value = root_mean_squared_error(y_true, y_pred)
    return rmse_value

rmse_value = rmse(topk_dict, val_data)
print(f"RMSE: {rmse_value:.4f}")

RMSE: 1.1746


In [32]:
import numpy as np
from collections import Counter


def recall_from_user_neighbors(topk_dict, train_mat, test_mat, N=50, like_threshold=1.0):
    n_users = test_mat.shape[0]
    recalls = []

    for u in range(n_users):
        # ground truth: 用户在测试集中真正喜欢的电影
        test_row = test_mat.getrow(u)
        liked_items = test_row.indices[test_row.data >= like_threshold]
        if len(liked_items) == 0:
            continue
        liked_set = set(liked_items)

        # 用户邻居
        neighbors = topk_dict[u]
        neighbor_ids = [nid for nid, sim in neighbors]

        # 收集邻居的电影
        movie_counter = Counter()
        for nid in neighbor_ids:
            row = train_mat.getrow(nid)
            movies = row.indices  # 简单方案：不加权
            movie_counter.update(movies)

        # 去掉用户训练集里已经看过的电影（避免泄漏）
        train_row_u = train_mat.getrow(u)
        seen_u = set(train_row_u.indices)

        # 剔除
        for m in seen_u:
            if m in movie_counter:
                del movie_counter[m]

        # 预测的 Top-N 电影
        if len(movie_counter) == 0:
            recalls.append(0.0)
            continue
        pred_items = [m for m, c in movie_counter.most_common(N)]
        pred_set = set(pred_items)

        # 计算召回率
        hit = len(liked_set & pred_set)
        recall = hit / len(liked_set)
        recalls.append(recall)

    if len(recalls) == 0:
        return 0.0

    return np.mean(recalls)
recall_score = recall_from_user_neighbors(topk_dict, train_data, val_data, N=50)
print(f"基于邻居的召回率: {recall_score:.4f}")

基于邻居的召回率: 0.3425
