# 相关训练过程

1. 读取数据集

In [1]:
import pandas as pd

# Load the ratings and movies tables from the ml-25m dataset directory.
ratings = pd.read_csv("../datasets/ml-25m/ratings.csv")
movies = pd.read_csv("../datasets/ml-25m/movies.csv")

# Attach the title of each rated movie; only the id and title columns of the
# movies table are carried into the merged frame.
df = ratings.merge(movies[["movieId", "title"]], on="movieId")


2. 构建矩阵

由于直接使用 pivot_table 构建评分矩阵会爆内存，这里先对用户和电影的索引进行编码，再使用 scipy 的 csr_matrix 构建稀疏评分矩阵。

In [4]:
from scipy.sparse import csr_matrix

# Encode the raw ids as category codes so the matrix is indexed by compact
# 0-based positions instead of the (much larger, gappy) original id values.
user_codes = df["userId"].astype("category").cat.codes
movie_codes = df["movieId"].astype("category").cat.codes

# Rows are encoded users, columns are encoded movies, values are ratings.
ratings_sparse = csr_matrix((df["rating"], (user_codes, movie_codes)))

# Report the matrix shape, then the number of stored (non-zero) entries.
print(ratings_sparse.shape, ratings_sparse.nnz, sep="\n")

(162541, 59047)
25000095


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def topk_user_similarity(ratings_sparse, k=50, batch_size=256):
    """Return the ``k`` most cosine-similar users for every user.

    Rows are L2-normalised once, and similarities are then computed as
    sparse matrix products over blocks of ``batch_size`` rows. The full
    user-by-user matrix is never materialised, and the normalisation is not
    repeated for every user.

    Args:
        ratings_sparse: SciPy sparse matrix with one row per user. It is
            copied internally and never modified.
        k: Maximum number of neighbours per user. It is capped at
            ``n_users - 1`` because a user is never its own neighbour.
        batch_size: Number of users scored per matrix product. Each batch
            allocates a dense ``batch_size x n_users`` float64 array, so
            lower it when memory is tight.

    Returns:
        Dict mapping each row index to a list of
        ``(neighbour_index, similarity)`` tuples ordered by decreasing
        similarity.

    Raises:
        ValueError: If ``k`` is negative or ``batch_size`` is not positive.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_users = ratings_sparse.shape[0]
    # A user cannot be its own neighbour, so at most n_users - 1 are available.
    n_neighbors = max(0, min(k, n_users - 1))
    if n_neighbors == 0:
        return {user: [] for user in range(n_users)}

    # astype() copies, so the in-place scaling below never touches the input.
    matrix = ratings_sparse.tocsr().astype(np.float64)
    matrix.sum_duplicates()

    # L2-normalise every row once; all-zero rows keep a divisor of 1 so they
    # stay all-zero and end up with similarity 0 to everyone.
    row_ids = np.repeat(np.arange(n_users), np.diff(matrix.indptr))
    norms = np.sqrt(
        np.bincount(row_ids, weights=matrix.data ** 2, minlength=n_users)
    )
    norms[norms == 0.0] = 1.0
    matrix.data /= norms[row_ids]

    # Convert the transpose to CSR once so each batch product is CSR @ CSR.
    matrix_t = matrix.T.tocsr()

    topk_sim = {}
    for start in range(0, n_users, batch_size):
        stop = min(start + batch_size, n_users)
        sims = (matrix[start:stop] @ matrix_t).toarray()

        local_rows = np.arange(stop - start)
        # -inf (rather than 0) guarantees self is never selected, even when
        # it would otherwise tie with users that share no rated items.
        sims[local_rows, np.arange(start, stop)] = -np.inf

        # Partial selection of the n_neighbors largest entries per row ...
        top_idx = np.argpartition(sims, -n_neighbors, axis=1)[:, -n_neighbors:]
        top_val = sims[local_rows[:, None], top_idx]
        # ... followed by a sort of just those entries, best first.
        order = np.argsort(-top_val, axis=1, kind="stable")
        top_idx = top_idx[local_rows[:, None], order]
        top_val = top_val[local_rows[:, None], order]

        for offset in local_rows:
            topk_sim[start + int(offset)] = list(
                zip(top_idx[offset], top_val[offset])
            )

    return topk_sim

# Build the neighbour table (up to 50 most similar users per user) for every
# row of the rating matrix.
topk_sim_dict = topk_user_similarity(ratings_sparse, k=50)

KeyboardInterrupt: 

In [5]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # 用 0 替换 NaN 计算相似度
# user_matrix = rating_matrix.fillna(0)

# # 计算用户-用户相似度矩阵
# user_similarity = cosine_similarity(user_matrix)
# user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

# user_similarity.head()


In [6]:
# target_user = 1
# top_users = user_similarity[target_user].sort_values(ascending=False)[1:6]
# print(top_users)


In [7]:
# # 权重相似度的加权平均
# def predict_ratings(user_id, rating_matrix, user_similarity):
#     sim_scores = user_similarity[user_id]
#     user_ratings = rating_matrix.loc[user_id]
    
#     # 未评分的电影
#     unrated_movies = user_ratings[user_ratings.isna()].index
    
#     pred_ratings = {}
#     for movie in unrated_movies:
#         # 取其他用户对该电影的评分
#         other_ratings = rating_matrix[movie]
        
#         # 计算加权平均（忽略NaN）
#         mask = ~other_ratings.isna()
#         if mask.sum() == 0:
#             continue
#         pred = np.dot(sim_scores[mask], other_ratings[mask]) / sim_scores[mask].sum()
#         pred_ratings[movie] = pred
#     return pd.Series(pred_ratings).sort_values(ascending=False)

# # 给用户4推荐电影
# predicted_ratings = predict_ratings(4, rating_matrix, user_similarity)
# predicted_ratings.head(20)