In [28]:
import pandas as pd
import os
from surprise import SVD
from surprise import Reader, Dataset
from surprise.model_selection import GridSearchCV
# from surprise import evaluate, print_perf


# 指定文件路径（包含‘user-item-rating’的数据）
# 协同过滤召回阶段不需要timestamp列吧？？？
file_path = "../dataset/dataset1/user.csv"
df = pd.read_csv(file_path, usecols=['用户ID', '电影名', '评分'])
df = df.rename(columns={'评分':'rating', '用户ID':'user', '电影名':'item'})

# 从数据中看评分的值有5种：2,4,6,8,10，范围是(2,10)
# rating_scale, tuple:The minimum and maximal rating of the rating scale.
reader = Reader(rating_scale=(2,10))
data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)

# 定义需要优选的参数网格
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'fcp'], cv=3)
gs.fit(data)

# 最佳 RMSE 得分
print('最佳RMSE得分\n', gs.best_score['rmse'])
print('最佳FCP得分\n', gs.best_score['fcp'])

# 能达到最佳 RMSE 得分的参数组合
print('最佳RMSE对应的参数组合\n', gs.best_params['rmse'])
print('最佳FCP对应的参数组合\n', gs.best_params['fcp'])

最佳RMSE得分
 2.389602823190996
最佳FCP得分
 0.4991754321724799
最佳RMSE对应的参数组合
 {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
最佳FCP对应的参数组合
 {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}


In [29]:
# 算法效果对比
from surprise import NormalPredictor # 假设评分数据来自一个正态分布
from surprise import BaselineOnly    #
from surprise import KNNWithMeans    # 在基础的CF算法上，去除了平均的均值
from surprise import KNNBaseline     # 在KNNWithMeans基础上，用baseline的值替换均值
from surprise import SVD             # 矩阵分解算法svd(biasSVD与funkSVD)
from surprise import SVDpp           # 考虑了隐反馈
from surprise import NMF             # 非负矩阵分解
from surprise.model_selection import cross_validate

algo = NormalPredictor()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('NormalPredictor perf:\n', perf)

algo = BaselineOnly()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('BaselineOnly perf:\n', perf)

algo = KNNWithMeans()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('KNNWithMeans perf:\n', perf)

algo = KNNBaseline()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('KNNBaseline perf:\n', perf)

algo = SVD()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('SVD perf:\n', perf)

algo = SVDpp()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('SVDpp perf:\n', perf)

algo = NMF()
perf = cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=3)
print('NMF perf:\n', perf)

NormalPredictor perf:
 {'test_rmse': array([3.4992782 , 3.48622305, 3.48919337]), 'test_mae': array([2.84053553, 2.8271095 , 2.82872745]), 'test_fcp': array([0.50230389, 0.50652295, 0.48729471]), 'fit_time': (0.22785353660583496, 0.29184556007385254, 0.30981993675231934), 'test_time': (0.6586208343505859, 0.57065749168396, 0.743586540222168)}
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
BaselineOnly perf:
 {'test_rmse': array([2.37463381, 2.37773532, 2.3771239 ]), 'test_mae': array([1.9991788 , 1.99596062, 1.99683421]), 'test_fcp': array([0.50407269, 0.50875925, 0.49419509]), 'fit_time': (0.4780704975128174, 0.5392487049102783, 0.5000476837158203), 'test_time': (0.4367482662200928, 0.5326733589172363, 0.5037007331848145)}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
K

In [None]:
# 算法选定，调参完成，进行预测

trainset = data.build_full_trainset()
# sim_options = {'name': 'pearson_baseline', 'user_based':False}
# # 基线估计(Baselines estimates)配置
# bsl_options = {'method':'als',
#               'reg_u':12,
#               'reg_i':5,
#               'n_epochs':5,}

# # bsl_options = {'method':'sgd',
# #               'learning_rate':.00005,
# #               'n_epochs':10}
# algo = KNNBaseline(k=20, min_k=1, sim_options=sim_options, bsl_options=bsl_options)
# algo.fit(trainset)

# 假设给定一个用户列表，给每一个用户推荐topN
user_inner_id_list = trainset.all_users()
for user_inner_id in user_inner_id_list:
    



In [None]:
data.head()

In [None]:
a = pd.read_csv("../dataset/dataset1/user.csv", encoding='utf-8')
a.head()