In [42]:
import numpy
import random
import pandas as pd
import math

In [43]:
# Load the ratings CSV (the path is relative to the notebook's working
# directory) and preview the first rows.
data = pd.read_csv('../data/ml-25m/ratings.csv')
print(data.head())

   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [44]:
# Keep only the user and movie id columns; the rating value and the timestamp
# are discarded here.
data = data.drop(columns=['rating', 'timestamp'])
print(data.head())

   userId  movieId
0       1      296
1       1      306
2       1      307
3       1      665
4       1      899


In [45]:
# Rename the id columns to user / item.
data = data.rename(columns={'userId': 'user', 'movieId': 'item'})
print(data.head())
# Convert the frame into a plain list of [user, item] rows.
# NOTE(review): `data` is rebound from a DataFrame to a list here, so running
# this cell a second time without reloading fails on `data.rename`.
data = data.values.tolist()
# print(data)

   user  item
0     1   296
1     1   306
2     1   307
3     1   665
4     1   899


In [52]:
def splitData(data, M, k, seed):
    '''
    Randomly split (user, item) records into a training and a test mapping.

    Every record draws a fold number uniformly from 0..M-1; records whose fold
    equals ``k`` go to the test set, so on average 1/M of the records are held
    out. ``random.randrange(M)`` is used because ``random.randint(0, M)`` also
    returns ``M`` itself and would therefore create M + 1 folds.

    :param data: iterable of (user, item) pairs
    :param M: number of folds, must be >= 1
    :param k: index of the fold used as the test set (0 <= k < M); any other
        value leaves the test set empty
    :param seed: seed for the global ``random`` generator, making the split
        reproducible
    :return: (train, test), each a dict mapping user -> list of items
    :raises ValueError: if M < 1
    '''
    if M < 1:
        raise ValueError('M must be a positive number of folds')
    test = dict()
    train = dict()
    random.seed(seed)
    for user, item in data:
        # Exactly one random draw per record keeps the split a pure function
        # of (data order, M, k, seed).
        target = test if random.randrange(M) == k else train
        target.setdefault(user, []).append(item)
    return train, test

In [53]:
# Split the [user, item] rows into train/test dicts; the fixed seed makes the
# split reproducible across runs.
train, test = splitData(data, M=8, k=4, seed=123)

In [54]:
def recall(train, test, N, recommend=None):
    '''
    Recall: the fraction of held-out (user, item) records that show up in the
    recommendation lists.

    Only users that appear in ``train`` and have at least one held-out item in
    ``test`` are evaluated; other users are skipped.

    :param train: dict mapping user -> training items; only its keys (the users
        to evaluate) are used here
    :param test: dict mapping user -> held-out items
    :param N: recommendation list length, forwarded to the recommender
    :param recommend: optional callable ``recommend(user, N)`` returning either
        a dict ``{item: score}`` or an iterable of ``(item, score)`` pairs.
        When omitted, the notebook-level ``getRec`` is called as
        ``getRec(user, N)``.
    :return: hits / number of held-out records, or 0.0 when nothing was
        evaluated
    '''
    if recommend is None:
        # Looked up at call time so a cell defined later can provide it.
        recommend = getRec
    hit = 0
    total = 0
    for user in train.keys():
        held_out = test.get(user)
        if not held_out:
            continue  # nothing to recall for this user
        rank = recommend(user, N)
        # Iterating a dict yields its keys (the items); pairs need unpacking.
        rec_items = rank if isinstance(rank, dict) else [item for item, _ in rank]
        wanted = set(held_out)  # set gives O(1) membership tests
        hit += sum(1 for item in rec_items if item in wanted)
        total += len(held_out)
    return hit / float(total) if total else 0.0

In [55]:
def precision(train, test, N, recommend=None):
    '''
    Precision: the fraction of recommended slots that correspond to held-out
    (user, item) records.

    Only users that appear in ``train`` and have at least one held-out item in
    ``test`` are evaluated; each of them contributes ``N`` to the denominator.

    Every metric asks the recommender again for each user, so when several
    metrics are computed it pays to pass a memoising ``recommend`` callable.

    :param train: dict mapping user -> training items; only its keys (the users
        to evaluate) are used here
    :param test: dict mapping user -> held-out items
    :param N: recommendation list length, forwarded to the recommender
    :param recommend: optional callable ``recommend(user, N)`` returning either
        a dict ``{item: score}`` or an iterable of ``(item, score)`` pairs.
        When omitted, the notebook-level ``getRec`` is called as
        ``getRec(user, N)``.
    :return: hits / (evaluated users * N), or 0.0 when nothing was evaluated
    '''
    if recommend is None:
        # Looked up at call time so a cell defined later can provide it.
        recommend = getRec
    hit = 0
    total = 0
    for user in train.keys():
        held_out = test.get(user)
        if not held_out:
            continue  # no ground truth for this user
        rank = recommend(user, N)
        # Iterating a dict yields its keys (the items); pairs need unpacking.
        rec_items = rank if isinstance(rank, dict) else [item for item, _ in rank]
        wanted = set(held_out)  # set gives O(1) membership tests
        hit += sum(1 for item in rec_items if item in wanted)
        total += N
    return hit / float(total) if total else 0.0

In [56]:
def coverage(train, test, N, recommend=None):
    '''
    Coverage: the share of all training items that appear in at least one
    user's recommendation list.

    :param train: dict mapping user -> training items (a list, set or dict
        keyed by item)
    :param test: unused; kept so all metrics share the same signature
    :param N: recommendation list length, forwarded to the recommender
    :param recommend: optional callable ``recommend(user, N)`` returning either
        a dict ``{item: score}`` or an iterable of ``(item, score)`` pairs.
        When omitted, the notebook-level ``getRec`` is called as
        ``getRec(user, N)``.
    :return: number of distinct recommended items / number of distinct
        training items, or 0.0 when there are no training items
    '''
    if recommend is None:
        # Looked up at call time so a cell defined later can provide it.
        recommend = getRec
    rec_items = set()
    all_items = set()
    for user in train.keys():
        # Plain iteration works for lists and for dicts keyed by item.
        all_items.update(train[user])
        rank = recommend(user, N)
        if isinstance(rank, dict):
            rec_items.update(rank)
        else:
            rec_items.update(item for item, _ in rank)
    return len(rec_items) / float(len(all_items)) if all_items else 0.0

In [57]:
def popularity(train, test, N, recommend=None):
    '''
    Novelty proxy: the mean log-popularity of the recommended items.

    An item's popularity is the number of training users that interacted with
    it. If mostly popular items are recommended the value is high, so a
    smaller result means a more novel recommendation list.

    :param train: dict mapping user -> training items (a list, set or dict
        keyed by item)
    :param test: unused; kept so all metrics share the same signature
    :param N: recommendation list length, forwarded to the recommender
    :param recommend: optional callable ``recommend(user, N)`` returning either
        a dict ``{item: score}`` or an iterable of ``(item, score)`` pairs.
        When omitted, the notebook-level ``getRec`` is called as
        ``getRec(user, N)``.
    :return: mean of log(1 + popularity) over all recommended items, or 0.0
        when nothing was recommended
    '''
    if recommend is None:
        # Looked up at call time so a cell defined later can provide it.
        recommend = getRec
    item_pop = dict()
    for items in train.values():
        for item in items:
            # Count how often the item occurs in the training interactions.
            item_pop[item] = item_pop.get(item, 0) + 1
    ret = 0.0
    n = 0
    for user in train.keys():
        rank = recommend(user, N)
        rec_items = rank if isinstance(rank, dict) else [item for item, _ in rank]
        for item in rec_items:
            # Popularity is long-tailed; the log keeps the mean stable.
            # Items never seen in training count as popularity 0.
            ret += math.log(1 + item_pop.get(item, 0))
            n += 1
    return ret / n if n else 0.0

In [68]:
def userSimilarity_cosine(train):
    '''
    Cosine similarity between every ordered pair of distinct users, computed
    directly from their item sets.

    This compares all user pairs, including pairs with no item in common
    (similarity 0.0), so the work and the result grow quadratically with the
    number of users.

    :param train: dict mapping user -> iterable of items
    :return: nested dict ``w[u][v] = |items(u) & items(v)| /
        sqrt(|items(u)| * |items(v)|)`` for all u != v
    '''
    # Convert once up front: the stored values may be lists, which do not
    # support the set intersection operator.
    item_sets = {u: set(items) for u, items in train.items()}
    w = dict()
    for u, u_items in item_sets.items():
        w[u] = dict()
        for v, v_items in item_sets.items():
            if u == v:
                continue
            norm = math.sqrt(len(u_items) * len(v_items))
            # A user without items has no direction; treat similarity as 0.
            w[u][v] = len(u_items & v_items) / norm if norm else 0.0
    return w


def userSimilarity(train):
    '''
    Cosine similarity between users, computed through an item -> users
    inverted index so only pairs sharing at least one item are touched.

    The co-occurrence table still holds one entry per ordered pair of users
    that share an item, so memory grows with the number of such pairs.

    :param train: dict mapping user -> iterable of items
    :return: nested dict ``w[u][v] = co_count(u, v) / sqrt(n_u * n_v)`` where
        ``n_u`` is the number of distinct items of user u; users without any
        co-rating neighbour have no entry
    '''
    # Inverted index: item -> set of users that interacted with it.
    item_users = dict()
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    C = dict()  # C[u][v]: number of items shared by u and v
    N = dict()  # N[u]: number of distinct items of u
    for users in item_users.values():
        for u in users:
            N[u] = N.get(u, 0) + 1
            if len(users) < 2:
                continue  # nobody to pair u with for this item
            row = C.setdefault(u, {})
            for v in users:
                if u == v:
                    continue
                row[v] = row.get(v, 0) + 1

    w = dict()
    for u, related_users in C.items():
        w[u] = {v: cuv / math.sqrt(N[u] * N[v])
                for v, cuv in related_users.items()}
    return w

In [61]:
def getRec(user, train, W, K):
    '''
    Build user-based collaborative-filtering scores for ``user``.

    Each of the K most similar users votes for the items they interacted with,
    weighted by their similarity to ``user``; votes are summed per item. The
    implicit rating of every interaction is 1.0. Summing the raw weights
    (instead of their logarithms) keeps scores non-negative, lets several
    neighbours reinforce an item, and stays defined for a similarity of 0.

    :param user: the user to recommend for
    :param train: dict mapping user -> iterable of items
    :param W: nested user similarity dict, ``W[u][v]`` = similarity of u and v
    :param K: number of most similar users to take into account
    :return: dict mapping candidate item -> accumulated score; items the user
        already interacted with are excluded. The dict is not sorted or
        truncated. It is empty when the user has no neighbours in ``W``.
    '''
    rank = dict()
    interacted_items = set(train.get(user, ()))  # items the user already has
    # Users without any similar user have no row in W.
    neighbours = sorted(W.get(user, {}).items(),
                        key=lambda x: x[1], reverse=True)[0:K]
    for v, wuv in neighbours:
        for i in train.get(v, ()):
            if i in interacted_items:
                continue  # never recommend what the user already has
            rank[i] = rank.get(i, 0.0) + wuv * 1.0
    return rank

In [69]:
# Build the user-user similarity table from the training split.
W = userSimilarity(train)
# W_test = userSimilarity(test)
N = 10  # passed as the third argument (N) to every metric below

# Compute each metric on the same train/test split and print it.
# NOTE(review): the metric functions call getRec(user, N), whereas getRec is
# defined with the parameters (user, train, W, K) - confirm how train, W and K
# are meant to reach the recommender.
# NOTE(review): the saved output of this cell is a MemoryError.
recall_value = recall(train, test, N)
print('recall: ', recall_value)
precision_value = precision(train, test, N)
print('precision: ', precision_value)
popularity_value = popularity(train, test, N)
print('popularity: ', popularity_value)
coverage_value = coverage(train, test, N)
print('coverage: ', coverage_value)

MemoryError: 