# 相关训练过程

1. 读取数据集

In [1]:
import pandas as pd

# Load the raw MovieLens-25M ratings and the movie catalogue.
ratings = pd.read_csv("../datasets/ml-25m/ratings.csv")
movies = pd.read_csv("../datasets/ml-25m/movies.csv")

# Attach each movie's title to its ratings (default inner join on movieId).
df = ratings.merge(movies[["movieId", "title"]], on="movieId")


In [2]:
import pandas as pd
import numpy as np


# Fixed-seed generator so the train/test split is reproducible across runs.
rng = np.random.default_rng(42)


def split_low_memory(df, test_ratio=0.2, min_items=5, generator=None):
    """Split ratings into train/test by holding out a random share of each user's rows.

    For every user with at least ``min_items`` rows,
    ``max(1, int(n_rows * test_ratio))`` row labels are sampled without
    replacement and moved to the test set. Users with fewer rows are left
    entirely in the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``userId`` column. Rows are selected by index label, so
        the index is expected to be unique (rows sharing a sampled label would
        all land in the test set).
    test_ratio : float, default 0.2
        Share of each eligible user's rows to hold out.
    min_items : int, default 5
        Users with fewer rows than this are never sampled.
    generator : numpy.random.Generator, optional
        Source of randomness. When omitted, the module-level ``rng`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``; together they contain every row of ``df``
        exactly once.
    """
    gen = rng if generator is None else generator

    held_out = []
    # Iterate over the userId column only: the per-user index is all we need,
    # so there is no reason to materialise every column for every group.
    for _, user_rows in df.groupby("userId")["userId"]:
        n_rows = len(user_rows)
        if n_rows < min_items:
            continue

        test_size = max(1, int(n_rows * test_ratio))
        held_out.append(gen.choice(user_rows.index, size=test_size, replace=False))

    # Keep the sampled labels as one flat array rather than boxing each label
    # into a Python list and then a set.
    if held_out:
        mask = df.index.isin(np.concatenate(held_out))
    else:
        mask = np.zeros(len(df), dtype=bool)

    # Slice the frame exactly once for each side of the split.
    test_df = df[mask]
    train_df = df[~mask]

    return train_df, test_df


# Hold out 20% of each user's ratings; users with fewer than 5 ratings stay in train.
train_df, test_df = split_low_memory(df, test_ratio=0.2, min_items=5)
print("train:", len(train_df), "test:", len(test_df))

train: 20062533 test: 4937562


回收 `df`：训练集和测试集已经切分完成，删除合并后的完整数据以释放内存。

In [None]:
import gc
# Drop this notebook's reference to the merged frame; the split results are
# held separately in train_df / test_df.
del df

# Run a full collection pass now instead of waiting for the next automatic one.
# As the cell's last expression, the return value of gc.collect() is displayed.
gc.collect()

0

In [10]:
from scipy.sparse import csr_matrix

def create_sparse_matrix(train_df, test_df):
    """Encode train/test ratings as user x movie CSR matrices with a shared layout.

    Row and column positions are derived from the training data only: each
    distinct ``userId`` / ``movieId`` seen in ``train_df`` gets a code in sorted
    category order. Test rows whose user or movie does not occur in the
    training data cannot be placed in that layout and are dropped.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with ``userId``, ``movieId`` and ``rating`` columns.

    Returns
    -------
    tuple of (scipy.sparse.csr_matrix, scipy.sparse.csr_matrix)
        ``(train_sparse, test_sparse)``; the test matrix is forced to the
        training matrix's shape.
    """
    # The id -> position vocabulary comes from the training set alone.
    train_users = pd.Categorical(train_df["userId"])
    train_movies = pd.Categorical(train_df["movieId"])

    train_sparse = csr_matrix(
        (train_df["rating"], (train_users.codes, train_movies.codes))
    )

    # Re-use the training vocabulary for the test set; unknown ids become -1.
    test_rows = pd.Categorical(
        test_df["userId"], categories=train_users.categories
    ).codes
    test_cols = pd.Categorical(
        test_df["movieId"], categories=train_movies.categories
    ).codes

    # Keep only test ratings whose user AND movie are both known.
    known = ~((test_rows == -1) | (test_cols == -1))

    test_sparse = csr_matrix(
        (test_df["rating"].values[known], (test_rows[known], test_cols[known])),
        shape=train_sparse.shape,
    )

    return train_sparse, test_sparse

# Encode the split as user x movie matrices, then report the training
# matrix's shape and its number of stored ratings.
train_sparse, test_sparse = create_sparse_matrix(train_df, test_df)
print(train_sparse.shape, train_sparse.nnz, sep="\n")

(162541, 56654)
20062533


In [11]:
# Same report for the test matrix: shape, then number of stored ratings.
print(test_sparse.shape, test_sparse.nnz, sep="\n")

(162541, 56654)
4934713


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import hnswlib


def hnswlib_topk_svd(ratings_sparse, k=50, dim=128, random_state=42):
    """Find each user's approximate top-k most similar users.

    The rating matrix is reduced to ``dim`` components with TruncatedSVD, the
    user vectors are L2-normalised, and an HNSW index in cosine space is
    queried with every user vector.

    Parameters
    ----------
    ratings_sparse : sparse matrix
        Rating matrix with one row per user.
    k : int, default 50
        Number of neighbours to return per user.
    dim : int, default 128
        Number of SVD components.
    random_state : int or None, default 42
        Seed forwarded to TruncatedSVD so repeated runs produce the same
        embedding. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    dict
        Maps each user row ``u`` to a list of ``(neighbour_row, similarity)``
        pairs, where ``similarity`` is ``1 - cosine distance`` as reported by
        the index. ``u`` itself is excluded; a list holds at most ``k`` pairs.
    """
    svd = TruncatedSVD(n_components=dim, random_state=random_state)
    user_vecs = svd.fit_transform(ratings_sparse).astype("float32")
    user_vecs = normalize(user_vecs, axis=1).astype("float32")
    n_users, n_dims = user_vecs.shape

    index = hnswlib.Index(space="cosine", dim=n_dims)
    index.init_index(max_elements=n_users, ef_construction=200, M=32)
    index.add_items(user_vecs)

    # Ask for one extra neighbour because a user is normally its own closest
    # hit, but never for more neighbours than there are indexed users.
    n_query = min(k + 1, n_users)
    # The search-time ef must not be smaller than the number of neighbours
    # requested; 128 remains the floor used for the default k.
    index.set_ef(max(128, n_query))

    top_ids, top_dists = index.knn_query(user_vecs, n_query)

    topk = {}
    for u in range(n_users):
        # Remove the user itself if the search returned it, then trim to k.
        not_self = top_ids[u] != u
        neighbour_ids = top_ids[u][not_self][:k]
        similarities = 1 - top_dists[u][not_self][:k]
        topk[u] = list(zip(neighbour_ids.tolist(), similarities.tolist()))
    return topk


# Build the neighbour lists from the training matrix only, so held-out test
# ratings do not leak into the similarity model.
topk_dict = hnswlib_topk_svd(train_sparse, k=50)
print(topk_dict[0])
