# 物品冷启动-利用物品的内容信息

In [1]:
# 导入包
import random
import math
import numpy as np
import time
from tqdm import tqdm, trange

## 一. 通用函数定义

In [2]:
# 定义装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper

### 1. 数据处理相关
1. load data
2. split data

In [3]:
class Dataset():
    
    def __init__(self, fp, ip):
        # fp: data file path
        self.data, self.content = self.loadData(fp, ip)
    
    @timmer
    def loadData(self, fp, ip):
        data = []
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split('::')[:2])))
        contents = {}
        for l in open(ip, 'rb'):
            l = str(l)[2:-1]
            contents[int(l.strip().split('::')[0])] = l.strip().split('::')[-1].split('|')
        return data, contents
    
    @timmer
    def splitData(self, M, k, seed=1):
        '''
        :params: data, 加载的所有(user, item)数据条目
        :params: M, 划分的数目，最后需要取M折的平均
        :params: k, 本次是第几次划分，k~[0, M)
        :params: seed, random的种子数，对于不同的k应设置成一样的
        :return: train, test
        '''
        train, test = [], []
        random.seed(seed)
        for user, item in self.data:
            # 这里与书中的不一致，本人认为取M-1较为合理，因randint是左右都覆盖的
            if random.randint(0, M-1) == k:  
                test.append((user, item))
            else:
                train.append((user, item))

        # 处理成字典的形式，user->set(items)
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                if user not in data_dict:
                    data_dict[user] = set()
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict

        return convert_dict(train), convert_dict(test), self.content

### 2. 评价指标
1. Precision
2. Recall
3. Coverage
4. Popularity(Novelty)

In [4]:
class Metric():
    
    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, 训练数据
        :params: test, 测试数据
        :params: GetRecommendation, 为某个用户获取推荐物品的接口函数
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
        
    # 定义精确率指标计算方式
    def precision(self):
        all, hit = 0, 0
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1
            all += len(rank)
        return round(hit / all * 100, 2)
    
    # 定义召回率指标计算方式
    def recall(self):
        all, hit = 0, 0
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1
            all += len(test_items)
        return round(hit / all * 100, 2)
    
    # 定义覆盖率指标计算方式
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            for item in self.train[user]:
                all_item.add(item)
            rank = self.recs[user]
            for item, score in rank:
                recom_item.add(item)
        return round(len(recom_item) / len(all_item) * 100, 2)
    
    # 定义新颖度指标计算方式
    def popularity(self):
        # 计算物品的流行度
        item_pop = {}
        for user in self.train:
            for item in self.train[user]:
                if item not in item_pop:
                    item_pop[item] = 0
                item_pop[item] += 1

        num, pop = 0, 0
        for user in self.test:
            rank = self.recs[user]
            for item, score in rank:
                if item in item_pop:
                    # 取对数，防止因长尾问题带来的被流行物品所主导
                    pop += math.log(1 + item_pop[item])
                    num += 1
        return round(pop / num, 6)
    
    def eval(self):
        metric = {'Precision': self.precision(),
                  'Recall': self.recall(),
                  'Coverage': self.coverage(),
                  'Popularity': self.popularity()}
        print('Metric:', metric)
        return metric

## 二. ContentItemKNN算法实现

In [10]:
#物品u,v的相似性由物品u,v的关键词向量的余弦相似性来表示
def ContentItemKNN(train, content, K, N):
    '''
    :params: train, 训练数据
    :params: content, 物品内容信息
    :params: K, 取相似Top-K相似物品
    :params: N, 推荐TopN物品的个数
    :return: GetRecommendation, 获取推荐结果的接口
    '''
    
    # 建立关键词-物品(word-item)倒排表
    word_item = {}
    item_word={}
    for item in content:
        for word in content[item]:
            if word not in word_item:
                word_item[word] = {}
            if item not in item_word:
                item_word[item] = {}
            #可疑pos1(已排除)
            word_item[word][item] = 1 #content中的物品是不重复的，所以其实不用累计，当然下面的写法更严谨
            if word not in item_word[item]:
                item_word[item][word] = 0
            item_word[item][word] += 1
            
#             if item not in word_item[word]:
#                 word_item[word][item] = 0
#             word_item[word][item] += 1
    
    #求物品的关键词向量(word[:][item])
#     print("len(content):")
#     print(len(content))
#     print(item_word)#tf全为1
    for word in word_item:
        for item in word_item[word]:
            #可疑pos2(已排除)
            #按关键词出现的频率(有多少物品含有该关键词)进行惩罚
            #如果word只为几个物品所有，则word对表征这几个物品的相似性很有帮助
            #如果word所有的物品都有，则word对表征物品的相似性没有帮助
            #衡量关键词与物品的关系(相似性)，即item在word维度上的权重
#             word_item[word][item] /= math.log(1 + len(word_item[word])) 
            word_item[word][item] *= math.log(len(content)/(1 + len(word_item[word]))) #idf
#             word_item[word][item] *= item_word[item][word]*math.log(len(content)/(1 + len(word_item[word]))) #tf-idf
            #word_item[word][item] /= len(word_item[word])
            
    # 计算相似度
    item_sim = {}
    mo = {}
    for word in word_item:
        for u in word_item[word]:
            if u not in item_sim:
                item_sim[u] = {}
                mo[u] = 0
            mo[u] += word_item[word][u] ** 2 #物品u在所有关键词维度上的权重的平方和，为了求u的关键词向量的模
            for v in word_item[word]:
                if u == v: continue
                if v not in item_sim[u]:
                    item_sim[u][v] = 0
                #通过关键词与物品的关系来建立物品与物品之间的关系
                #可疑pos3
                item_sim[u][v] += word_item[word][u] * word_item[word][v] #求u,v的关键词向量的点积
                
    #为什么余弦相似度不要分母(注掉这部分)效果反而好很多？
    for u in item_sim:
        for v in item_sim[u]:
            #可疑pos4
            #if u == v: continue 没影响
            item_sim[u][v] /= math.sqrt(mo[u] * mo[v]) #余弦相似度
                
    # 按照相似度排序
    sorted_item_sim = {k: list(sorted(v.items(), \
                               key=lambda x: x[1], reverse=True)) \
                       for k, v in item_sim.items()}
        
    # 获取接口函数
    def GetRecommendation(user):
        items = {}
#         print("user")
#         print(user)
        seen_items = set(train[user])
        for item in train[user]:
#             print("sorted_item_sim[item]:")
#             print(sorted_item_sim[item])
            for u, _ in sorted_item_sim[item][:K]: #每次选择与已见item内容相似性最高的K个物品(排除掉已见的)
                # 要去掉用户见过的
                if u not in seen_items:
                    if u not in items:
                        items[u] = 0
                    items[u] += item_sim[item][u] #累计已见item与新物品u的相似性
        #从约len(train[user])*K 个新物品中选N个
        recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N] 
        return recs
    
    return GetRecommendation

## 三. ContentItemKNN实验
M=8, N=10, K=10

In [6]:
class Experiment():
    
    def __init__(self, M, N, K, fp='../dataset/ml-1m/ratings.dat', ip='../dataset/ml-1m/movies.dat'):
        '''
        :params: M, 进行多少次实验，这是交叉验证的折数
        :params: N, TopN推荐物品的个数
        :params: K, 取Top-K相似物品数目
        :params: fp, 数据文件路径
        :params: ip, 物品内容文件路径
        '''
        self.M = M
        self.K = K
        self.N = N
        self.fp = fp
        self.ip = ip
        self.alg = ContentItemKNN
    
    # 定义单次实验
    @timmer
    def worker(self, train, test, content):
        '''
        :params: train, 训练数据集
        :params: test, 测试数据集
        :return: 各指标的值
        '''
        getRecommendation = self.alg(train, content, self.K, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 多次实验取平均
    @timmer
    def run(self):
        metrics = {'Precision': 0, 'Recall': 0, 
                   'Coverage': 0, 'Popularity': 0}
        dataset = Dataset(self.fp, self.ip)
        for ii in range(self.M):
            train, test, content = dataset.splitData(self.M, ii)
            print('Experiment {}:'.format(ii))
            metric = self.worker(train, test, content)
            metrics = {k: metrics[k]+metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print('Average Result (M={}, N={}, K={}): {}'.format(\
                              self.M, self.N, self.K, metrics))

In [13]:
M, N, K = 5, 10, 10#M=1为啥会报错？，M的大小会对结果有很大影响，感觉问题可能就在这里
exp = Experiment(M, N, K)
exp.run()
#M=2时，Average Result (M=2, N=10, K=10): {'Precision': 6.255, 'Recall': 0.755, 'Coverage': 20.509999999999998, 'Popularity': 4.1529605}
#M=8时，Average Result (M=8, N=10, K=10): {'Precision': 1.8075, 'Recall': 0.86625, 'Coverage': 16.4425, 'Popularity': 4.616854}
#M=5时，Average Result (M=5, N=10, K=10): {'Precision': 2.7939999999999996, 'Recall': 0.8400000000000001, 'Coverage': 17.242, 'Popularity': 4.5457862}

Func loadData, run time: 2.208726167678833
Func splitData, run time: 4.945830345153809
Experiment 0:
Metric: {'Precision': 2.84, 'Recall': 0.85, 'Coverage': 17.07, 'Popularity': 4.547464}
Func worker, run time: 12.965397357940674
Func splitData, run time: 2.859919548034668
Experiment 1:
Metric: {'Precision': 2.77, 'Recall': 0.83, 'Coverage': 17.38, 'Popularity': 4.544962}
Func worker, run time: 11.274342775344849
Func splitData, run time: 2.8373403549194336
Experiment 2:
Metric: {'Precision': 2.78, 'Recall': 0.84, 'Coverage': 17.01, 'Popularity': 4.568636}
Func worker, run time: 10.99608588218689
Func splitData, run time: 2.850891590118408
Experiment 3:
Metric: {'Precision': 2.84, 'Recall': 0.86, 'Coverage': 17.21, 'Popularity': 4.540507}
Func worker, run time: 10.817004442214966
Func splitData, run time: 2.9173104763031006
Experiment 4:
Metric: {'Precision': 2.74, 'Recall': 0.82, 'Coverage': 17.54, 'Popularity': 4.527362}
Func worker, run time: 11.147749185562134
Average Result (M=5, 

# 四. 实验结果
1. Random实验（复制第二章的结果）   

    Running time: 185.54872608184814
    
    Average Result (M=8, K=0, N=10): 
    {'Precision': 0.61, 'Recall': 0.29, 
     'Coverage': 100.0, 'Popularity': 4.38958}
 
2. MostPopular实验（复制第二章的结果）

    Running time: 103.3697898387909
    
    Average Result (M=8, K=0, N=10): 
    {'Precision': 12.83, 'Recall': 6.16, 
    'Coverage': 2.43, 'Popularity': 7.72326}
    
3. ItemCF实验（复制第二章的结果）

    Running time: 835.2476677894592
    
    Average Result (M=8, K=10, N=10): 
    {'Precision': 22.17, 'Recall': 10.65, 
     'Coverage': 19.11, 'Popularity': 7.2495425}

4. ContentItemKNN实验

    Running time: 76.9517409801483
    
    Average Result (M=8, N=10, K=10): {'Precision': 1.80, 'Recall': 0.865, 'Coverage': 16.676, 'Popularity': 4.61807}

# 五. 问题
实验结果与书中的不符合(大多数指标明显偏低)，不知道是否是实现错误。
可能的原因：
1. 可疑pos1 word_item[word][item]没有累计（已排除）
(1).  word_item[word][item] = 1 的结果：
Average Result (M=8, N=10, K=10): {'Precision': 1.79875, 'Recall': 0.86375, 'Coverage': 16.61625, 'Popularity': 4.617733125000001}
(2).  word_item[word][item] += 1 的结果：
Average Result (M=8, N=10, K=10): {'Precision': 1.79875, 'Recall': 0.86375, 'Coverage': 16.61625, 'Popularity': 4.617733125000001}

结果一模一样，原因是：content中的物品是不重复的，所以其实不用累计

2. 可疑pos2 影响不大
注掉之后：Average Result (M=8, N=10, K=10): {'Precision': 1.9137499999999998, 'Recall': 0.91875, 'Coverage': 16.1325, 'Popularity': 4.655494}

3. 可疑pos3

4. **可疑pos4 注掉之后效果显著提升，说明主要就是余弦相似度的分母的问题，但为什么余弦相似度不要分母效果反而好？**
Average Result (M=8, N=10, K=10): {'Precision': 5.01625, 'Recall': 2.4099999999999997, 'Coverage': 14.30125, 'Popularity': 5.658101375}

5. 同时注掉pos2,pos4:
Average Result (M=8, N=10, K=10): {'Precision': 4.22375, 'Recall': 2.0262499999999997, 'Coverage': 11.79125, 'Popularity': 5.6080193750000005}
Func run, run time: 32.52777910232544

6. 更改了关键词向量的计算方式 word_item[word][item] /= len(word_item[word])
Average Result (M=8, N=10, K=10): {'Precision': 1.7887500000000003, 'Recall': 0.85875, 'Coverage': 16.535, 'Popularity': 4.613214875000001}

7. 同时做4,6：说明关键词向量的计算方式还是原来的好
Average Result (M=8, N=10, K=10): {'Precision': 2.45, 'Recall': 1.17625, 'Coverage': 12.22875, 'Popularity': 5.431244625}

# 附录：日志（请双击查看）
1. Random实验（复制第二章的结果）
Func loadData, run time: 1.40358304977417
Func splitData, run time: 2.1179611682891846
Experiment 0:
Metric: {'Precision': 0.61, 'Recall': 0.29, 'Coverage': 100.0, 'Popularity': 4.3952}
Func worker, run time: 20.80728793144226
Func splitData, run time: 2.039689064025879
Experiment 1:
Metric: {'Precision': 0.61, 'Recall': 0.29, 'Coverage': 100.03, 'Popularity': 4.384244}
Func worker, run time: 22.058059692382812
Func splitData, run time: 2.129431962966919
Experiment 2:
Metric: {'Precision': 0.64, 'Recall': 0.3, 'Coverage': 100.0, 'Popularity': 4.38938}
Func worker, run time: 18.35742425918579
Func splitData, run time: 2.0330629348754883
Experiment 3:
Metric: {'Precision': 0.62, 'Recall': 0.3, 'Coverage': 100.0, 'Popularity': 4.393025}
Func worker, run time: 22.459643840789795
Func splitData, run time: 2.0501880645751953
Experiment 4:
Metric: {'Precision': 0.61, 'Recall': 0.3, 'Coverage': 100.0, 'Popularity': 4.393217}
Func worker, run time: 23.829069137573242
Func splitData, run time: 1.952528953552246
Experiment 5:
Metric: {'Precision': 0.57, 'Recall': 0.27, 'Coverage': 100.03, 'Popularity': 4.388441}
Func worker, run time: 21.796540021896362
Func splitData, run time: 2.1322124004364014
Experiment 6:
Metric: {'Precision': 0.61, 'Recall': 0.29, 'Coverage': 100.0, 'Popularity': 4.382586}
Func worker, run time: 19.419902801513672
Func splitData, run time: 1.9659440517425537
Experiment 7:
Metric: {'Precision': 0.61, 'Recall': 0.29, 'Coverage': 100.0, 'Popularity': 4.390586}
Func worker, run time: 18.834516286849976
Average Result (M=8, K=0, N=10): {'Precision': 0.61, 'Recall': 0.29125, 'Coverage': 100.0075, 'Popularity': 4.389584875000001}
Func run, run time: 185.54872608184814

2. MostPopular实验（复制第二章的结果）
Func loadData, run time: 1.403282880783081
Func splitData, run time: 1.9211320877075195
Experiment 0:
Metric: {'Precision': 12.85, 'Recall': 6.17, 'Coverage': 2.47, 'Popularity': 7.724273}
Func worker, run time: 10.972801923751831
Func splitData, run time: 1.9256069660186768
Experiment 1:
Metric: {'Precision': 13.07, 'Recall': 6.26, 'Coverage': 2.28, 'Popularity': 7.721385}
Func worker, run time: 10.841933012008667
Func splitData, run time: 1.910295009613037
Experiment 2:
Metric: {'Precision': 12.89, 'Recall': 6.16, 'Coverage': 2.44, 'Popularity': 7.722067}
Func worker, run time: 10.727141857147217
Func splitData, run time: 1.882903814315796
Experiment 3:
Metric: {'Precision': 12.81, 'Recall': 6.15, 'Coverage': 2.49, 'Popularity': 7.723152}
Func worker, run time: 10.670467138290405
Func splitData, run time: 1.918154001235962
Experiment 4:
Metric: {'Precision': 12.7, 'Recall': 6.11, 'Coverage': 2.47, 'Popularity': 7.724644}
Func worker, run time: 10.960633993148804
Func splitData, run time: 1.9205529689788818
Experiment 5:
Metric: {'Precision': 12.9, 'Recall': 6.22, 'Coverage': 2.38, 'Popularity': 7.7234}
Func worker, run time: 10.842862129211426
Func splitData, run time: 1.9104499816894531
Experiment 6:
Metric: {'Precision': 12.91, 'Recall': 6.21, 'Coverage': 2.47, 'Popularity': 7.721658}
Func worker, run time: 10.716413974761963
Func splitData, run time: 1.9528350830078125
Experiment 7:
Metric: {'Precision': 12.53, 'Recall': 6.01, 'Coverage': 2.41, 'Popularity': 7.725531}
Func worker, run time: 10.732755184173584
Average Result (M=8, K=0, N=10): {'Precision': 12.832500000000001, 'Recall': 6.16125, 'Coverage': 2.42625, 'Popularity': 7.723263749999999}
Func run, run time: 103.3697898387909

3. ItemCF实验（复制第二章的结果）
Func loadData, run time: 1.2348299026489258
Func splitData, run time: 1.8201029300689697
Experiment 0:
Metric: {'Precision': 22.01, 'Recall': 10.57, 'Coverage': 19.35, 'Popularity': 7.248504}
Func worker, run time: 104.0655460357666
Func splitData, run time: 1.8287677764892578
Experiment 1:
Metric: {'Precision': 22.12, 'Recall': 10.59, 'Coverage': 18.95, 'Popularity': 7.244242}
Func worker, run time: 103.43892693519592
Func splitData, run time: 1.804075002670288
Experiment 2:
Metric: {'Precision': 22.59, 'Recall': 10.8, 'Coverage': 19.19, 'Popularity': 7.245515}
Func worker, run time: 103.44988584518433
Func splitData, run time: 1.7733349800109863
Experiment 3:
Metric: {'Precision': 22.02, 'Recall': 10.58, 'Coverage': 19.37, 'Popularity': 7.245227}
Func worker, run time: 104.05003190040588
Func splitData, run time: 1.8094689846038818
Experiment 4:
Metric: {'Precision': 22.11, 'Recall': 10.63, 'Coverage': 19.33, 'Popularity': 7.260709}
Func worker, run time: 100.68873810768127
Func splitData, run time: 1.7294957637786865
Experiment 5:
Metric: {'Precision': 22.17, 'Recall': 10.69, 'Coverage': 19.02, 'Popularity': 7.251251}
Func worker, run time: 101.01811790466309
Func splitData, run time: 1.73459792137146
Experiment 6:
Metric: {'Precision': 22.4, 'Recall': 10.77, 'Coverage': 18.48, 'Popularity': 7.24112}
Func worker, run time: 101.37971901893616
Func splitData, run time: 1.7321960926055908
Experiment 7:
Metric: {'Precision': 21.98, 'Recall': 10.54, 'Coverage': 19.18, 'Popularity': 7.259772}
Func worker, run time: 101.52781391143799
Average Result (M=8, K=10, N=10): {'Precision': 22.174999999999997, 'Recall': 10.646249999999998, 'Coverage': 19.10875, 'Popularity': 7.2495425}
Func run, run time: 835.2476677894592

4. ContentItemKNN实验
Func loadData, run time: 1.8746070861816406
Func splitData, run time: 2.308419942855835
Experiment 0:
Metric: {'Precision': 1.65, 'Recall': 0.79, 'Coverage': 17.13, 'Popularity': 4.616362}
Func worker, run time: 8.00853180885315
Func splitData, run time: 2.2228479385375977
Experiment 1:
Metric: {'Precision': 1.81, 'Recall': 0.87, 'Coverage': 16.81, 'Popularity': 4.611126}
Func worker, run time: 6.897377967834473
Func splitData, run time: 1.8748629093170166
Experiment 2:
Metric: {'Precision': 1.9, 'Recall': 0.91, 'Coverage': 16.29, 'Popularity': 4.621031}
Func worker, run time: 6.969089031219482
Func splitData, run time: 1.9319438934326172
Experiment 3:
Metric: {'Precision': 1.74, 'Recall': 0.84, 'Coverage': 16.63, 'Popularity': 4.623422}
Func worker, run time: 7.836109161376953
Func splitData, run time: 1.8041598796844482
Experiment 4:
Metric: {'Precision': 1.79, 'Recall': 0.86, 'Coverage': 16.31, 'Popularity': 4.619975}
Func worker, run time: 9.520605087280273
Func splitData, run time: 1.8084988594055176
Experiment 5:
Metric: {'Precision': 1.84, 'Recall': 0.89, 'Coverage': 16.47, 'Popularity': 4.638201}
Func worker, run time: 6.670089960098267
Func splitData, run time: 1.8033149242401123
Experiment 6:
Metric: {'Precision': 1.82, 'Recall': 0.88, 'Coverage': 17.13, 'Popularity': 4.607126}
Func worker, run time: 6.848791122436523
Func splitData, run time: 1.8024988174438477
Experiment 7:
Metric: {'Precision': 1.84, 'Recall': 0.88, 'Coverage': 16.64, 'Popularity': 4.607283}
Func worker, run time: 6.614134073257446
Average Result (M=8, N=10, K=10): {'Precision': 1.79875, 'Recall': 0.865, 'Coverage': 16.67625, 'Popularity': 4.61806575}
Func run, run time: 76.9517409801483