In [1]:
import numpy as np 
import math 
import time 
from tqdm.autonotebook import tqdm
import random
from collections import defaultdict



# 通用函数

In [2]:
# 定义时间装饰器，监控运行时间
def timmer(func):
    """Decorator that prints the wall-clock run time of each call to ``func``.

    The wrapped callable keeps the original name and docstring, and its
    return value is passed through unchanged. The timing line is printed
    only when the call returns normally (an exception propagates untouched).
    """
    import functools  # local import so the block does not depend on the imports cell

    # Fall back to repr() for callables that have no __name__ (e.g. partials).
    name = getattr(func, "__name__", repr(func))

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot go negative when
        # the system clock is adjusted during a long run.
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        stop_time = time.perf_counter()
        print("Func {:s}, run time: {:f}".format(name, stop_time - start_time))
        return res
    return wrapper

# 数据处理相关

In [3]:
# 数据集的读取
class Dataset:
    """Loads (user_id, movie_id) pairs from a "::"-separated ratings file."""

    def __init__(self, fp):
        """Read the file at path ``fp`` and keep its records in ``self.data``."""
        self.data = self.loadData(fp)

    @timmer
    def loadData(self, fp):
        """Parse the ratings file at ``fp``.

        Only the first two "::"-separated fields of each line are kept and
        converted to int. Blank lines are skipped.

        Returns a list of tuples in file order.
        """
        data = []
        # The context manager guarantees the handle is closed, even if a
        # malformed line makes int() raise half-way through the file.
        with open(fp) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                data.append(tuple(map(int, line.split("::")[:2])))
        return data

    @timmer
    def splitData(self, M, k, seed=1):
        '''
        Randomly split ``self.data`` into a train part and a test part.

        M: number of folds; results are meant to be averaged over the M folds
        k: index of the fold used as the test set, k in [0, M)
        seed: random seed; use the same value for every k so that the folds
              of one experiment are drawn from the same random sequence
        return: (train, test), each a dict mapping user_id -> list of movie_ids
        '''
        train, test = [], []
        # NOTE: this reseeds the module-level RNG of `random`.
        random.seed(seed)
        for user, item in self.data:
            if random.randint(0, M-1) == k:
                test.append((user, item))
            else:
                train.append((user, item))

        def convert_dict(pairs):
            """Group (user, item) pairs into {user: [distinct items]}."""
            grouped = defaultdict(set)
            for user, item in pairs:
                grouped[user].add(item)
            return {user: list(items) for user, items in grouped.items()}

        return convert_dict(train), convert_dict(test)

# 评价指标

In [4]:
class Metric:
    """Evaluates a recommender on a train/test split.

    Recommendations are computed once, for every user in ``test``, when the
    object is constructed; every metric then reads the cached result.
    """

    def __init__(self, train, test, GetRecommendation):
        '''
        train: dict mapping user -> list of items in the training set
        test: dict mapping user -> list of items in the test set
        GetRecommendation: callable taking a user and returning that user's
            recommendations as a list of (item, score) pairs
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        # Cached recommendations for every test user.
        self.recs = self.getRec()

    def getRec(self):
        """Return {user: recommendation list} for every user in the test set."""
        return {user: self.GetRecommendation(user) for user in self.test}

    @staticmethod
    def _percent(numerator, denominator):
        """Return numerator/denominator as a percentage rounded to 2 digits.

        An empty denominator yields 0.0 instead of raising ZeroDivisionError.
        """
        if denominator == 0:
            return 0.0
        return round(numerator / denominator * 100, 2)

    def _hit_count(self):
        """Count recommended items that appear in the same user's test items."""
        hit = 0
        for user in self.test:
            test_items = set(self.test[user])
            for item, score in self.recs[user]:
                if item in test_items:
                    hit += 1
        return hit

    def precision(self):
        """Percentage of recommended items that are in the user's test items."""
        total = sum(len(self.recs[user]) for user in self.test)
        return self._percent(self._hit_count(), total)

    def recall(self):
        """Percentage of test items that were recommended to their user."""
        total = sum(len(set(self.test[user])) for user in self.test)
        return self._percent(self._hit_count(), total)

    def coverage(self):
        """Percentage of distinct recommended items over the distinct
        training items of the test users."""
        all_item, recom_item = set(), set()
        for user in self.test:
            # A test user may be absent from train; treat it as having no items.
            all_item.update(self.train.get(user, ()))
            for item, score in self.recs[user]:
                recom_item.add(item)
        return self._percent(len(recom_item), len(all_item))

    def popularity(self):
        """Mean log-popularity of the recommended items.

        Popularity of an item is the number of training users that have it.
        """
        item_pop = {}
        for user in self.train:
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1
        num, pop = 0, 0
        for user in self.test:
            for item, score in self.recs[user]:
                # The log damps the long tail so that a few very popular items
                # do not dominate the average. An item unseen in train counts
                # as popularity 0.
                pop += math.log(1 + item_pop.get(item, 0))
                num += 1
        if num == 0:
            return 0.0
        return round(pop / num, 6)

    def eval(self):
        """Compute all four metrics, print them and return them as a dict."""
        metric = {
            "Precision": self.precision(),
            "Recall": self.recall(),
            "Coverage": self.coverage(),
            "Popularity": self.popularity()
        }
        print("Metric:  ", metric)
        return metric

# 算法实现

- ItemCF
- ItemIUF
- ItemIUF_Norm

In [5]:
# 基于物品的余弦相似度推荐
def ItemCF(train, K, N):
    """Build an item-based collaborative-filtering recommender.

    Similarity of items u and v is the number of users having both, divided
    by sqrt(users having u * users having v).

    train: dict mapping user -> list of items
    K: number of most-similar items considered for each item the user has
    N: maximum length of the returned recommendation list
    return: function ``GetRecommendation(user)`` that returns a list of
        (item, score) pairs sorted by descending score
    """
    # Co-occurrence counts, turned into similarities in place below.
    sim = defaultdict(dict)
    # Number of users that have each item.
    num = {}
    for items in train.values():
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            for j, v in enumerate(items):
                if j == i:
                    continue
                sim[u][v] = sim[u].get(v, 0) + 1

    # Normalise each co-occurrence count into a cosine-style similarity.
    for u, related in sim.items():
        for v in related:
            related[v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every item, most similar first.
    sorted_item_sim = {
        item: sorted(related.items(), key=lambda pair: pair[1], reverse=True)
        for item, related in sim.items()
    }

    def GetRecommendation(user):
        """Return up to N (item, score) pairs for ``user``; [] if the user
        has no training items."""
        history = train.get(user)
        if not history:
            return []
        seen_items = set(history)
        scores = {}
        for item in history:
            # An item that never co-occurred with another item has no
            # neighbour list; it simply contributes nothing.
            for neighbor, weight in sorted_item_sim.get(item, ())[:K]:
                if neighbor not in seen_items:
                    scores[neighbor] = scores.get(neighbor, 0) + weight
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:N]

    return GetRecommendation
            

In [6]:
# 基于改进的物品余弦相似度推荐
def ItemIUF(train, K, N):
    """Build an item-based recommender with inverse-user-frequency weighting.

    Each co-occurrence contributed by a user is weighted by
    1 / log(1 + number of items of that user), so users with many items
    contribute less to item similarity. The weighted sum is then divided by
    sqrt(users having u * users having v).

    train: dict mapping user -> list of items
    K: number of most-similar items considered for each item the user has
    N: maximum length of the returned recommendation list
    return: function ``GetRecommendation(user)`` that returns a list of
        (item, score) pairs sorted by descending score
    """
    sim = defaultdict(dict)
    # Number of users that have each item.
    num = {}
    for items in train.values():
        # The weight depends only on the user, so compute it once per user.
        # It is only used when the user has at least two items; the guard
        # also avoids log(1) == 0 for an empty item list.
        weight = 1 / math.log(1 + len(items)) if len(items) > 1 else 0.0
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            for j, v in enumerate(items):
                if j == i:
                    continue
                sim[u][v] = sim[u].get(v, 0) + weight

    # Normalise each weighted co-occurrence into a cosine-style similarity.
    for u, related in sim.items():
        for v in related:
            related[v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every item, most similar first.
    sorted_item_sim = {
        item: sorted(related.items(), key=lambda pair: pair[1], reverse=True)
        for item, related in sim.items()
    }

    def GetRecommendation(user):
        """Return up to N (item, score) pairs for ``user``; [] if the user
        has no training items."""
        history = train.get(user)
        if not history:
            return []
        seen_items = set(history)
        scores = {}
        for item in history:
            # An item that never co-occurred with another item has no
            # neighbour list; it simply contributes nothing.
            for neighbor, weight in sorted_item_sim.get(item, ())[:K]:
                if neighbor not in seen_items:
                    scores[neighbor] = scores.get(neighbor, 0) + weight
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:N]

    return GetRecommendation
            

In [7]:
# ItemIUF similarity followed by a per-item min-max normalization of the similarity scores
def ItemIUF_Norm(train, K, N):
    """Build an ItemIUF recommender with per-item normalised similarities.

    Similarities are computed as in ItemIUF (co-occurrences weighted by
    1 / log(1 + number of items of the user), divided by
    sqrt(users having u * users having v)). Then, for every item, the
    similarities to its neighbours are min-max scaled to [0, 1]; when all of
    an item's similarities are equal they are all set to 1.

    train: dict mapping user -> list of items
    K: number of most-similar items considered for each item the user has
    N: maximum length of the returned recommendation list
    return: function ``GetRecommendation(user)`` that returns a list of
        (item, score) pairs sorted by descending score
    """
    sim = defaultdict(dict)
    # Number of users that have each item.
    num = {}
    for items in train.values():
        # The weight depends only on the user, so compute it once per user.
        # It is only used when the user has at least two items; the guard
        # also avoids log(1) == 0 for an empty item list.
        weight = 1 / math.log(1 + len(items)) if len(items) > 1 else 0.0
        for i, u in enumerate(items):
            num[u] = num.get(u, 0) + 1
            for j, v in enumerate(items):
                if j == i:
                    continue
                sim[u][v] = sim[u].get(v, 0) + weight

    # Normalise each weighted co-occurrence into a cosine-style similarity.
    for u, related in sim.items():
        for v in related:
            related[v] /= math.sqrt(num[u] * num[v])

    # Min-max scale the similarities of each item. Every entry of `sim` has
    # at least one neighbour, so max()/min() never see an empty sequence.
    for related in sim.values():
        s_max = max(related.values())
        s_min = min(related.values())
        s_gap = s_max - s_min
        if s_gap > 0:
            for v in related:
                related[v] = (related[v] - s_min) / s_gap
        else:
            for v in related:
                related[v] = 1

    # Neighbours of every item, most similar first.
    sorted_item_sim = {
        item: sorted(related.items(), key=lambda pair: pair[1], reverse=True)
        for item, related in sim.items()
    }

    def GetRecommendation(user):
        """Return up to N (item, score) pairs for ``user``; [] if the user
        has no training items."""
        history = train.get(user)
        if not history:
            return []
        seen_items = set(history)
        scores = {}
        for item in history:
            # An item that never co-occurred with another item has no
            # neighbour list; it simply contributes nothing.
            for neighbor, weight in sorted_item_sim.get(item, ())[:K]:
                if neighbor not in seen_items:
                    scores[neighbor] = scores.get(neighbor, 0) + weight
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:N]

    return GetRecommendation

# 实验

- ItemCF实验，K=[5, 10, 20 ,40, 80, 160]
- ItemIUF实验，K=10
- ItemIUF_Norm实验，K=10

M = 8, N = 10 

In [9]:
class Experiment:
    """Runs an M-fold evaluation of one of the item-based recommenders."""

    def __init__(self, M, K, N, fp="data/ml-1m/ratings.dat", rt="ItemCF"):
        """
        M: number of folds
        K: number of most-similar items used per item by the algorithm
        N: number of items recommended per user
        fp: path of the ratings file
        rt: algorithm name, one of the keys of ``self.alg``. The default is
            "ItemCF"; the previous default "UserCF" is not implemented here
            and always failed at lookup time.
        """
        self.M = M
        self.K = K
        self.N = N
        self.fp = fp
        self.rt = rt
        self.alg = {"ItemCF": ItemCF, "ItemIUF": ItemIUF, "ItemIUF_Norm": ItemIUF_Norm}

    @timmer
    def worker(self, train, test):
        """Run one fold: build the recommender on ``train`` and return the
        metric dict computed against ``test``.

        Raises KeyError if ``self.rt`` is not a known algorithm name.
        """
        try:
            algorithm = self.alg[self.rt]
        except KeyError:
            # Same exception type as before, but with an actionable message.
            raise KeyError(
                "Unknown algorithm {!r}; expected one of {}".format(self.rt, sorted(self.alg))
            ) from None
        getRecommendation = algorithm(train, self.K, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()

    @timmer
    def run(self):
        """Run all M folds, print the averaged metrics and return them."""
        metrics = {"Precision": 0, "Recall": 0,
                  "Coverage": 0, "Popularity": 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print(f"Experiment {ii}: ")
            metric = self.worker(train, test)
            metrics = {k: metrics[k]+metric[k] for k in metrics}

        metrics = {k: metrics[k] / self.M for k in metrics}

        print(f"Average Result (M={self.M}, K={self.K}, N={self.N}): {metrics}")
        # Return the averages so callers can compare runs programmatically.
        return metrics

## ItemCF实验

In [12]:
# ItemCF experiment: 8 folds, top-10 recommendations, sweeping the
# neighbourhood size K.
M = 8
N = 10
for K in (5, 10, 20, 40, 80, 160):
    print(f"K = {K}")
    cf_exp = Experiment(M, K, N, rt="ItemCF")
    cf_exp.run()

K = 5
Func loadData, run time: 0.760128
Func splitData, run time: 1.034323
Experiment 0: 
Metric:   {'Precision': 21.29, 'Recall': 10.22, 'Coverage': 21.3, 'Popularity': 7.167103}
Func worker, run time: 67.470705
Func splitData, run time: 1.034475
Experiment 1: 
Metric:   {'Precision': 21.45, 'Recall': 10.27, 'Coverage': 21.85, 'Popularity': 7.151314}
Func worker, run time: 67.601480
Func splitData, run time: 1.135150
Experiment 2: 
Metric:   {'Precision': 21.3, 'Recall': 10.18, 'Coverage': 22.03, 'Popularity': 7.165002}
Func worker, run time: 66.531618
Func splitData, run time: 1.129149
Experiment 3: 
Metric:   {'Precision': 21.17, 'Recall': 10.18, 'Coverage': 21.34, 'Popularity': 7.178365}
Func worker, run time: 67.569018
Func splitData, run time: 1.032109
Experiment 4: 
Metric:   {'Precision': 21.21, 'Recall': 10.2, 'Coverage': 21.8, 'Popularity': 7.170794}
Func worker, run time: 66.888301
Func splitData, run time: 1.032360
Experiment 5: 
Metric:   {'Precision': 21.39, 'Recall': 10.

Func splitData, run time: 1.151029
Experiment 1: 
Metric:   {'Precision': 19.76, 'Recall': 9.46, 'Coverage': 12.5, 'Popularity': 7.368402}
Func worker, run time: 96.679556
Func splitData, run time: 1.046546
Experiment 2: 
Metric:   {'Precision': 19.68, 'Recall': 9.41, 'Coverage': 11.96, 'Popularity': 7.379513}
Func worker, run time: 97.383067
Func splitData, run time: 1.053457
Experiment 3: 
Metric:   {'Precision': 19.4, 'Recall': 9.32, 'Coverage': 12.08, 'Popularity': 7.389774}
Func worker, run time: 97.731717
Func splitData, run time: 1.044704
Experiment 4: 
Metric:   {'Precision': 19.26, 'Recall': 9.26, 'Coverage': 12.21, 'Popularity': 7.385536}
Func worker, run time: 98.504062
Func splitData, run time: 1.061249
Experiment 5: 
Metric:   {'Precision': 19.06, 'Recall': 9.19, 'Coverage': 12.22, 'Popularity': 7.379692}
Func worker, run time: 103.585943
Func splitData, run time: 1.089876
Experiment 6: 
Metric:   {'Precision': 19.25, 'Recall': 9.25, 'Coverage': 11.95, 'Popularity': 7.3743

## ItemIUF实验

In [14]:
# M: number of folds; K: number of most-similar items used in the
# computation; N: number of top items recommended.
M = 8
N = 10
K = 10
iuf_exp = Experiment(M, K, N, rt="ItemIUF")
iuf_exp.run()

Func loadData, run time: 0.804874
Func splitData, run time: 1.082875
Experiment 0: 
Metric:   {'Precision': 22.51, 'Recall': 10.81, 'Coverage': 17.53, 'Popularity': 7.346247}
Func worker, run time: 151.033581
Func splitData, run time: 1.150083
Experiment 1: 
Metric:   {'Precision': 22.87, 'Recall': 10.95, 'Coverage': 17.43, 'Popularity': 7.346612}
Func worker, run time: 147.101052
Func splitData, run time: 1.170229
Experiment 2: 
Metric:   {'Precision': 22.93, 'Recall': 10.96, 'Coverage': 17.86, 'Popularity': 7.353326}
Func worker, run time: 143.943954
Func splitData, run time: 1.160159
Experiment 3: 
Metric:   {'Precision': 22.5, 'Recall': 10.82, 'Coverage': 17.55, 'Popularity': 7.347087}
Func worker, run time: 143.605090
Func splitData, run time: 1.049845
Experiment 4: 
Metric:   {'Precision': 22.23, 'Recall': 10.69, 'Coverage': 17.62, 'Popularity': 7.355618}
Func worker, run time: 141.331724
Func splitData, run time: 1.026713
Experiment 5: 
Metric:   {'Precision': 22.73, 'Recall': 1

In [15]:
# M: number of folds; K: number of most-similar items used in the
# computation; N: number of top items recommended.
M = 8
N = 10
K = 10
iufnorm_exp = Experiment(M, K, N, rt="ItemIUF_Norm")
iufnorm_exp.run()

Func loadData, run time: 0.758595
Func splitData, run time: 1.215408
Experiment 0: 
Metric:   {'Precision': 23.22, 'Recall': 11.16, 'Coverage': 22.68, 'Popularity': 7.236576}
Func worker, run time: 143.391394
Func splitData, run time: 1.166116
Experiment 1: 
Metric:   {'Precision': 23.59, 'Recall': 11.29, 'Coverage': 22.97, 'Popularity': 7.234724}
Func worker, run time: 143.808425
Func splitData, run time: 1.044035
Experiment 2: 
Metric:   {'Precision': 23.74, 'Recall': 11.35, 'Coverage': 22.87, 'Popularity': 7.24085}
Func worker, run time: 143.561788
Func splitData, run time: 1.043489
Experiment 3: 
Metric:   {'Precision': 23.24, 'Recall': 11.17, 'Coverage': 22.45, 'Popularity': 7.238275}
Func worker, run time: 140.527070
Func splitData, run time: 1.039667
Experiment 4: 
Metric:   {'Precision': 23.28, 'Recall': 11.19, 'Coverage': 22.29, 'Popularity': 7.249065}
Func worker, run time: 140.112039
Func splitData, run time: 1.036376
Experiment 5: 
Metric:   {'Precision': 23.67, 'Recall': 1