In [31]:
import os
from surprise import Reader, Dataset
# Path to the ratings file, relative to the notebook's working directory.
file_path = os.path.expanduser('../../data/data_suprise_format.csv')
# File layout: comma-separated "user item rating timestamp" columns, the first
# line is skipped, and ratings are declared on a 1-10 scale.
reader = Reader(line_format='user item rating timestamp', sep=',',skip_lines=1, rating_scale=(1, 10))
# Load the ratings from the file using the reader configured above.
data = Dataset.load_from_file(file_path, reader=reader)
# Display the loaded dataset object (no fold splitting happens in this cell).
data

<surprise.dataset.DatasetAutoFolds at 0x25d53cb7748>

In [32]:
from surprise.model_selection import KFold

In [33]:
# 5-fold cross-validation splitter, reused by the evaluation loops below.
# NOTE(review): no random seed is passed, so the fold assignment presumably
# differs between runs -- confirm if reproducible metrics are needed.
kf = KFold(n_splits=5)

In [34]:
### Evaluate SVD with cross-validation
from surprise import SVD, accuracy
algo = SVD()

for trainset, testset in kf.split(data):
    # Train on this fold's training split, then predict its test split
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Report RMSE and MAE for this fold
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

RMSE: 2.3211
MAE:  1.9003
RMSE: 2.3049
MAE:  1.8901
RMSE: 2.3100
MAE:  1.8973
RMSE: 2.3124
MAE:  1.8984
RMSE: 2.3081
MAE:  1.8966


In [35]:
### Evaluate mean-centered collaborative filtering (KNNWithMeans) with cross-validation
from surprise import KNNWithMeans
algo = KNNWithMeans()

for trainset, testset in kf.split(data):
    # Train on this fold's training split, then predict its test split
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Report RMSE and MAE for this fold
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.3931
MAE:  1.8234
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.3831
MAE:  1.8140
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.3926
MAE:  1.8172
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.3955
MAE:  1.8219
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.3916
MAE:  1.8222


In [36]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io

# Build the full training set from `data`; the final model below is fitted on it.
print("构建数据集...")
trainset = data.build_full_trainset()

构建数据集...


In [37]:
# Show the item count reported by the full training set.
trainset.n_items

23031

In [38]:
# Show the user count reported by the full training set.
trainset.n_users

13545

In [39]:
# Alternative model kept for reference (disabled):
# from surprise import KNNBaseline
from collections import defaultdict
print("开始训练模型...")
# algo = KNNBaseline()
# Fit SVD on the full training set; `algo` is the model used for the predictions below.
algo = SVD()
algo.fit(trainset)

开始训练模型...


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25d5816f630>

In [40]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions: Iterable of 5-field prediction records
            (user id, item id, true rating, estimated rating, details),
            such as the list returned by an algorithm's ``test`` method.
        n (int): Maximum number of recommendations kept per user.
            Defaults to 10.

    Returns:
        A ``defaultdict(list)`` mapping each raw user id to a list of
        ``(raw item id, estimated rating)`` tuples, ordered from the highest
        estimate to the lowest and truncated to at most ``n`` entries.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it was predicted for.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each bucket by estimate (descending; ties keep their input order)
    # and keep only the leading n entries.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [41]:
# testset = trainset.build_anti_testset()
# Disabled: building the anti-testset for every user needs too much memory to run on the full dataset.

In [47]:
import random 
def build_anti_testset(trainset,fill=None, n=10, seed=None):
    """Build an anti-testset for a random sample of users.

    For each sampled user, every item of the trainset that the user has not
    rated is emitted as a candidate to be scored.

    Args:
        trainset: Object exposing ``global_mean``, ``all_users()``,
            ``all_items()``, ``ur`` (inner user id -> iterable of
            ``(inner item id, rating)`` pairs), ``to_raw_uid`` and
            ``to_raw_iid``.
        fill: Placeholder rating stored in each triple. Defaults to
            ``trainset.global_mean``; any other value is converted to float.
        n (int): Number of users to sample. Values larger than the number of
            users are clamped, so every user is used instead of raising.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the module-level ``random`` state is
            used.

    Returns:
        list of ``(raw user id, raw item id, fill)`` tuples.

    Raises:
        ValueError: If ``n`` is negative (raised by ``random.sample``).
    """
    fill = trainset.global_mean if fill is None else float(fill)
    rng = random if seed is None else random.Random(seed)

    # Materialise both id collections once: sampling needs a sequence, and the
    # item list would otherwise be re-requested for every sampled user.
    all_users = list(trainset.all_users())
    all_items = list(trainset.all_items())
    sample_size = min(n, len(all_users))

    anti_testset = []
    for u in rng.sample(all_users, sample_size):
        user_items = {j for (j, _) in trainset.ur[u]}
        raw_uid = trainset.to_raw_uid(u)
        anti_testset.extend(
            (raw_uid, trainset.to_raw_iid(i), fill)
            for i in all_items
            if i not in user_items
        )
    return anti_testset
# Build the anti-testset for a random sample of 50 users only.
testset = build_anti_testset(trainset,n=50)

In [48]:
# Score every (user, item) pair of the sampled anti-testset with the fitted model.
predictions = algo.test(testset)

In [49]:
import pandas as pd
# Keep the 10 highest-estimated items per sampled user.
top_n = get_top_n(predictions, n=10)

# Load the lookup tables: user ID -> user name and movie ID -> movie title
uid2name = pd.read_csv('../../data/user_list.csv')
mid2name = pd.read_csv('../../data/movie_list.csv')

In [54]:
# Print each user's top-k recommended movies and collect them in a list
import json


def _first_match_names(frame, id_column):
    """Map str(id) -> value of the frame's second column, keeping the first row per id."""
    names_by_id = {}
    for raw_id, name in zip(frame[id_column].astype("str"), frame.iloc[:, 1]):
        names_by_id.setdefault(raw_id, name)
    return names_by_id


# Build each lookup once instead of re-casting and scanning a whole ID column
# for every user and every recommended movie.
user_names = _first_match_names(uid2name, '用户ID')
movie_names = _first_match_names(mid2name, '电影ID')

# (uid, JSON-encoded recommendation list) pairs collected for later storage.
user_movie_info = []
for uid, user_ratings in top_n.items():
    user_movie_info.append((uid, json.dumps(user_ratings)))
    # Fall back to the raw id when a lookup table has no matching row
    # (previously this raised IndexError and aborted the loop).
    username = user_names.get(uid, uid)
    print(username,
          [str(movie_names.get(iid, iid)) for (iid, _) in user_ratings])

八爪的小飞船 ['太阳底下', '毛驴县令之一奶同胞', '留住有情人', '妈妈的记忆', '待避', '玩命剧组', '百万金臂', '这样的爱', '怪兽：黑暗大陆', '地下']
复仇者的从容 ['阿德尔曼夫妇', '有缘的陌生人', '独立愚连队西行', '大佛普拉斯', '太阳底下', '蓝色迷情', '我的脑内恋碍选项OVA', '火线反攻', '未来忍者', '盐湖城朋克']
徐渭 ['妈妈的记忆', '重启咲良田 前篇', '吻', '扑通扑通我的人生', '夏日之王', '这样的爱', '尽善尽美', '美味餐厅', '三只小孤儿猫', '一年中的九天']
Quasar， ['妈妈的记忆', '毛驴县令之一奶同胞', '扑通扑通我的人生', '夏日之王', '三只小孤儿猫', '神犬侦探', '冷场', '疯狂侏罗纪', '路过未来', '水獭塔卡']
Hunter ['我世界的中心', '男孩与鹈鹕', '妈妈的记忆', '十三小时', '谍影重重5', '天堂的颜色', '逆时·恒美', '太阳底下', '阴阳路19：我对眼见到嘢', '当爱情失去记忆']
新名龙马 ['天赐良缘', '给自己的情书', '阿德尔曼夫妇', '龙狼血战', '三只小孤儿猫', '逆时·恒美', '毛驴县令之一奶同胞', '未来忍者', '爱在夏天', '恋人曲']
卡特斯拉 ['三只小孤儿猫', '夏日之王', '妈妈的记忆', '未来忍者', '太极旗飘扬', '浪漫杀手自由人', '太阳底下', '花眼', '我的脑内恋碍选项OVA', '想听到说相爱']
?. ['天赐良缘', '妈妈的记忆', '背叛的街角', '花眼', '党的女儿尹灵芝', '给自己的情书', '角斗英雄', '未来忍者', '星际牛仔：天国之门', '跟踪']
E.wong ['妈妈的记忆', '未来忍者', '背叛的街角', '沉睡的声音', '逆时·恒美', '太阳底下', '淘金俏冤家', '喜欢你', '轰天神鹰', '急速杀机']
艾托 ['蜡笔小新：奇异乐园大冒险', '留住有情人', '太阳底下', '南国少年巴布华', '逆时·恒美', '想听到说相爱', '眩：北斋之女', '三毛从军记', '时间遗忘的天使', '英雄本色2']
morgan ['逆时·恒美'

In [55]:
import redis
import traceback
# Store the recommendation data in Redis
def save_redis(items, db=1, base_url=None):
    """Write (key, value) pairs to a Redis database.

    Args:
        items: Iterable of indexable records; element 0 is used as the key
            and element 1 as the value.
        db: Index of the Redis database, appended to the connection URL.
        base_url: Connection URL without the database index. When omitted,
            the ``REDIS_BASE_URL`` environment variable is used, falling back
            to the local placeholder URL, so real credentials do not have to
            be written into the notebook.

    Returns:
        None.

    Errors raised while writing are printed with their traceback and not
    re-raised (best-effort save). ``KeyboardInterrupt`` and ``SystemExit``
    are no longer swallowed.
    """
    import os  # local import: keeps the function independent of other cells' imports

    if base_url is None:
        base_url = os.environ.get(
            'REDIS_BASE_URL', 'redis://username:password@127.0.0.1:6379/')
    # Normalise the trailing slash so both "host:port" and "host:port/" work.
    redis_url = base_url.rstrip('/') + '/' + str(db)
    client = redis.from_url(redis_url)
    try:
        for item in items:
            client.set(item[0], item[1])
    except Exception:
        traceback.print_exc()

# Save each user's top-k recommendations, keyed by uid
save_redis(user_movie_info, db=15)