# 相关训练过程

1. 读取数据集

In [1]:
import pandas as pd

ratings = pd.read_csv(
    "../datasets/ml-25m/ratings.csv"
)
movies = pd.read_csv(
    "../datasets/ml-25m/movies.csv",
)
df = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')


2. 构建矩阵

这里由于矩阵如果直接使用非常pivot_table去构建一个评分矩阵会爆内存，所以这里对索引进行编码后，使用scipy的csr_matrix构建矩阵

In [2]:
from scipy.sparse import csr_matrix

# 编码（避免巨大稀疏索引）
user_codes = df['userId'].astype('category').cat.codes
movie_codes = df['movieId'].astype('category').cat.codes

ratings_sparse = csr_matrix(
    (df['rating'], (user_codes, movie_codes))
)

print(ratings_sparse.shape)
print(ratings_sparse.nnz)   # 非零元素数量

(162541, 59047)
25000095


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def topk_user_similarity(ratings_sparse, k=50):
    sim = cosine_similarity(ratings_sparse, ratings_sparse)
    np.fill_diagonal(sim, 0)  # 自己和自己的相似度设为0
    topk_idx = np.argpartition(-sim, k)[:,:k]
    topk_sim = {
        i: [(j, sim[j]) for j in topk_idx[i]]
        for i in range(ratings_sparse.shape[0])
    }

    return topk_sim

topk_sim_dict = topk_user_similarity(ratings_sparse, k=50)

MemoryError: Unable to allocate 174. GiB for an array with shape (23330235575,) and data type int64

这里爆内存了，数据量过于庞大，计算时间过长，在实践应用中完全不可行

In [5]:
import hnswlib
from sklearn.preprocessing import normalize


def hnswlib_topk_user_similarity(ratings_sparse, k=50, ef=200, M=48):
    """
    使用 HNSW (hnswlib) 做用户-用户 top-k 余弦相似度搜索。
    与 FAISS 版本行为一致（内积 = 余弦）。
    """

    # -------- 1. CSR → dense（如果太大你可以换成稀疏降维）
    X = ratings_sparse.toarray().astype("float32")

    # -------- 2. L2 normalize（这样内积 = 余弦相似度）
    X_norm = normalize(X, axis=1).astype("float32")

    n_users, dim = X_norm.shape

    # -------- 3. 初始化 HNSW index（使用内积空间）
    index = hnswlib.Index(
        space="ip",  # 内积（cosine 兼容）
        dim=dim,
    )

    # 初始化索引结构（M：图的连接度）
    index.init_index(max_elements=n_users, ef_construction=ef, M=M)

    # -------- 4. 添加向量
    index.add_items(X_norm)

    # -------- 5. 设置搜索参数
    index.set_ef(ef)

    # -------- 6. 查询 top-k+1（会包含自身）
    labels, distances = index.knn_query(X_norm, k=k + 1)

    # -------- 7. 构造结果
    topk_dict = {}
    for user in range(n_users):
        ids = labels[user]
        sims = distances[user]

        # 去掉自身
        mask = ids != user
        top_ids = ids[mask][:k]
        top_sims = sims[mask][:k]

        topk_dict[user] = list(zip(top_ids.tolist(), top_sims.tolist()))

    return topk_dict

topk_dict = hnswlib_topk_user_similarity(ratings_sparse, k=50)
print(topk_dict[0])

MemoryError: Unable to allocate 71.5 GiB for an array with shape (162541, 59047) and data type float64

In [7]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import hnswlib


def hnswlib_topk_svd(ratings_sparse, k=50, dim=128):
    svd = TruncatedSVD(n_components=dim)
    X = svd.fit_transform(ratings_sparse).astype("float32")

    X_norm = normalize(X, axis=1).astype("float32")
    n_users, dim = X_norm.shape

    index = hnswlib.Index(space="cosine", dim=dim)
    index.init_index(max_elements=n_users, ef_construction=200, M=32)
    index.add_items(X_norm)
    index.set_ef(128)

    top_ids, top_dists = index.knn_query(X_norm, k + 1)

    topk = {}
    for u in range(n_users):
        ids = top_ids[u]
        dists = top_dists[u]
        mask = ids != u
        topk[u] = list(zip(ids[mask][:k].tolist(), (1 - dists[mask][:k]).tolist()))
    return topk
topk_dict = hnswlib_topk_svd(ratings_sparse, k=50)
print(topk_dict[0])

[(4504, 0.6947439908981323), (88041, 0.6860814094543457), (68920, 0.6663422584533691), (5380, 0.6532018780708313), (101270, 0.6452866196632385), (96039, 0.6431542038917542), (49308, 0.642228901386261), (162312, 0.6404700875282288), (28910, 0.6395902633666992), (82442, 0.6370313167572021), (97639, 0.6355757117271423), (7430, 0.6338117122650146), (156351, 0.6273589134216309), (125172, 0.6268659234046936), (81512, 0.6244006156921387), (30084, 0.6231658458709717), (84233, 0.6198986768722534), (57754, 0.6176114082336426), (30872, 0.6153122186660767), (1200, 0.6148878335952759), (152227, 0.612818717956543), (117248, 0.6119475364685059), (2591, 0.6080026030540466), (49815, 0.6068069338798523), (40897, 0.6061033606529236), (78369, 0.603630542755127), (133795, 0.6011833548545837), (50205, 0.6001582741737366), (77562, 0.5983670949935913), (126719, 0.5973210334777832), (136248, 0.5961025953292847), (157949, 0.5944840908050537), (69684, 0.5930356979370117), (140235, 0.5929831266403198), (139476, 0

In [9]:
import numpy as np


def predict_rating_userCF(user, item, ratings_sparse, topk_dict):
    neighbors = topk_dict[user]

    num = 0.0
    den = 0.0

    for v, sim in neighbors:
        r_vi = ratings_sparse[v, item]  # 稀疏矩阵支持索引
        if r_vi > 0:  # 仅使用有评分的邻居
            num += sim * r_vi
            den += sim

    if den == 0:
        return 0  # 或者返回用户均值、全局均值等

    return num / den
rating = predict_rating_userCF(0, 1234, ratings_sparse, topk_dict)
print(rating)

2.5


In [None]:
def recommend_items_userCF(user, ratings_sparse, topk_dict, topN=10):
    n_items = ratings_sparse.shape[1]
    user_rated = ratings_sparse[user].indices
    predictions = []

    for item in range(n_items):
        if item in user_rated:
            continue  # 跳过已看
        pred = predict_rating_userCF(user, item, ratings_sparse, topk_dict)
        predictions.append((item, pred))

    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:topN]
print(recommend_items_userCF(0, ratings_sparse, topk_dict, topN=10))


[(226, np.float64(5.0)), (342, np.float64(5.0)), (490, np.float64(5.0)), (575, np.float64(5.0)), (657, np.float64(5.0)), (705, np.float64(5.0)), (765, np.float64(5.0)), (1141, np.float64(5.0)), (1161, np.float64(5.0)), (1173, np.float64(5.0))]
