In [2]:
import random 
import math 
import time 
from tqdm import tqdm 

from collections import defaultdict

# 定义通用函数

In [3]:
# 定义装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func {:s}, run time: {:f}".format(func.__name__, stop_time-start_time))
        return res 
    return wrapper

# 数据处理相关的函数

## 观察数据

In [6]:
with open("data/hetrec2011-delicious-2k/user_taggedbookmarks.dat", 'r') as f: 
    for i, line in enumerate(f):
        print(line)
        if i > 2:
            break

userID	bookmarkID	tagID	day	month	year	hour	minute	second

8	1	1	8	11	2010	23	29	22

8	2	1	8	11	2010	23	25	59

8	7	1	8	11	2010	18	55	1



- 只需要前3列`userID`, `bookmarkID`, `tagID`

In [7]:
# 读取数据
class Dataset:
    def __init__(self, fp):
        self.data = self.loadData(fp)
    
    @timmer 
    def loadData(self, fp):
        data = [f.strip().split('\t')[:3] for f in open(fp, 'r').readlines()[1:]]
        new_data = {}
        for user, item, tag in data: 
            if user not in new_data:
                new_data[user] = {}
            if item not in new_data[user]:
                new_data[user][item] = set()
            # 保存用户u对物品i添加的标签tag的集合
            new_data[user][item].add(tag)
        ret = []
        for user in new_data:
            for item in new_data[user]:
                # 得到三元组
                ret.append((user, item, list(new_data[user][item])))
        return ret 
    
    @timmer 
    def splitData(self, M, k, seed=2019):
        train, test = [], []
        random.seed(seed)
        for user, item, tags in self.data:
            if random.randint(0, M-1) == k: 
                test.append((user, item, tags))
            else:
                train.append((user, item, tags))
        
        # 处理成字典的形式
        def convert_dict(data):
            data_dict = {}
            for user, item, tags in data: 
                if user not in data_dict:
                    data_dict[user] = {}
                data_dict[user][item] = tags
            return data_dict
        return convert_dict(train), convert_dict(test)

# 评价指标

In [34]:
class Metric:
    def __init__(self, train, test, GetRecommendation):
        self.train = train 
        self.test = test 
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()
        
    # 为test中每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
    
    # 定义精确率指标
    def precision(self):
        all, hit = 0, 0 
        for user in self.test: 
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank: 
                if item in test_items:
                    hit += 1 
            all += len(rank)
        return round(hit/all*100, 2)
    
    # 定义召回率指标计算方法
    def recall(self):
        all, hit = 0, 0 
        for user in self.test: 
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank: 
                if item in test_items: 
                    hit += 1 
            all += len(test_items)
        return round(hit/all * 100, 2)
    
    # 定义覆盖率指标计算方式
    def coverage(self):
        all_item, recom_item = set(), set() 
        for user in self.train: 
            for item in self.train[user]:
                all_item.add(item)
        
        for user in self.test:
            rank = self.recs[user]
            for item, score in rank: 
                recom_item.add(item)
        
        return round(len(recom_item)/len(all_item)*100, 2)
    
    # 定义多样性指标计算方式
    def diversity(self):
        # 计算item_vec，每个tag的个数
        item_tags = {}
        for user in self.train:
            for item in self.train[user]:
                if item not in item_tags:
                    item_tags[item] = {} 
                for tag in self.train[user][item]:
                    if tag not in item_tags[item]:
                        item_tags[item][tag] = 0 
                    item_tags[item][tag] += 1 
        # 计算两个item的相似度
        def CosineSim(u, v):
            ret = 0 
            for tag in item_tags[u]:
                if tag in item_tags[v]:
                    ret += item_tags[u][tag] * item_tags[v][tag]
            nu, nv = 0, 0 
            for tag in item_tags[u]:
                nu += item_tags[u][tag] ** 2 
            for tag in item_tags[v]:
                nv += item_tags[v][tag] ** 2 
            return ret / math.sqrt(nu*nv)
        
        # 计算Diversity
        div = []
        for user in self.test: 
            rank = self.recs[user]
            sim, cnt = 0, 0 
            for u, _ in rank: 
                for v, _ in rank: 
                    if u == v: 
                        continue
                    sim += CosineSim(u, v)
                    cnt += 1 
            sim = sim /cnt if sim !=0 else 0 
            div.append(1 - sim)
        
        return sum(div) / len(div)
    
    # 定义新颖度指标计算方式
    def popularity(self):
        ## 计算物品的流行度，为给这个物品打过标签的用户数
        item_pop = {}
        for user in self.train: 
            for item in self.train[user]:
                if item not in item_pop:
                    item_pop[item] = 0 
                item_pop[item] += 1 
        
        num, pop = 0, 0 
        for user in self.test: 
            rank = self.recs[user]
            for item, score in rank: 
                ## 取对数，防止因长尾问题带来的被流行物品所主导的
                pop += math.log(1 + item_pop[item])
                num += 1 
        return round(pop/num, 4)
    
    def eval(self):
        metric = {"Precision": self.precision(), 
                 "Recall": self.recall(), 
                 "Coverage": self.coverage(), 
                 "Diversity": self.diversity(), 
                 "Popularity": self.popularity()}
        print("Metric: ", metric)
        return metric

# 算法实现

- SimpleTagBased
- TagBasedTFIDF
- TagBasedTFIDF++
- TagExtend

In [35]:
# 1.基于热门标签的推荐
def SimpleTagBased(train, N):
    # 统计user_tags, tag_items
    ## user_tag表示用户打过的tags的数量
    ## tag_items表示某个物品被打上tag的次数
    user_tags, tags_items = {}, {} 
    for user in train: 
        user_tags[user] = {}
        for item in train[user]:
            for tag in train[user][item]:
                user_tags[user][tag] = user_tags[user].get(tag, 0) + 1 
            
            if tag not in tags_items:
                tags_items[tag] = {}
            tags_items[tag][item] = tags_items[tag].get(item, 0) + 1 
    
    
    def GetRecommendation(user):
        # 按照打分推荐N个未见过的
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag in user_tags[user]:
            if tag in tags_items:
                for item in tags_items[tag]:
                    if item in seen_items:
                        continue
                    if item not in item_score:
                        item_score[item] = 0 
                    item_score[item] += user_tags[user][tag] * tags_items[tag][item]
        item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True))
        return item_score[:N]
    
    return GetRecommendation
                    

In [36]:
# 2. 改进一：为热门标签加入惩罚项
def TagBasedTFIDF(train, N):
    user_tags, tag_items = {}, {} 
    # 统计打过此标签的用户数
    tag_pop = defaultdict(set)
    for user in train:
        user_tags[user] = {} 
        for item in train[user]:
            for tag in train[user][item]:
                if tag not in user_tags[user]:
                    user_tags[user][tag] = 0 
                user_tags[user][tag] += 1 
                if tag not in tag_items:
                    tag_items[tag] = {}
                tag_items[tag][item] = tag_items[tag].get(item, 0) + 1 
                # 将该用户添加到该tag下
                tag_pop[tag].add(user)
    tag_pop = {k: len(v) for k, v in tag_pop.items()}
    
    def GetRecommendation(user):
        if user not in user_tags:
            return []
        
        seen_items = set(train[user])
        item_score = {}
        for tag in user_tags[user]:
            for item in tag_items[tag]:
                if item in seen_items:
                    continue
                if item not in item_score:
                    item_score[item] = 0 
                item_score[item] += user_tags[user][tag] * tag_items[tag][item] / math.log(1+tag_pop[tag])
        item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True))
        return item_score[:N]
    
    return GetRecommendation

In [56]:
# 3.改进二：同时对热门商品加入惩罚项
def TagBasedTFIDF_Improved(train, N):
    # 统计user_tags和tag_items
    user_tags, tag_items = {}, {}
    # 统计标签和物品的热门程度
    tag_pop, item_pop = {}, {} 
    for user in train: 
        user_tags[user] = {}
        for item in train[user]:
            item_pop[item] = item_pop.get(item, 0) + 1 
            for tag in train[user][item]:
                user_tags[user][tag] = user_tags[user].get(tag, 0) + 1 
                if tag not in tag_items:
                    tag_items[tag] = {} 
                tag_items[tag][item] = tag_items[tag].get(item, 0) + 1 
                if tag not in tag_pop:
                    tag_pop[tag] = set() 
                tag_pop[tag].add(user)
    tag_pop = {k: len(v) for k, v in tag_pop.items()}
    
    def GetRecommendation(user):
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag in user_tags[user]:
            for item in tag_items[tag]:
                if item in seen_items:
                    continue
                if item not in item_score:
                    item_score[item] = 0 
                item_score[item] += (user_tags[user][tag] * tag_items[tag][item]) / (math.log(1 + tag_pop[tag])+math.log(1+item_pop[item]))
        item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True))
        return item_score[:N]
    return GetRecommendation

In [54]:
# 4. 基于标签改进的推荐
def ExpandTagBased(train, N, M=20):
    # 计算标签之间的相似度
    ## 如果标签出现在同一商品的描述中，说明比较相似
    item_tag = defaultdict(set)
    for user in train: 
        for item in train[user]:
            for tag in train[user][item]:
                item_tag[item].add(tag)
    tag_sim, tag_cnt = {}, {}
    for item in item_tag:
        for u in item_tag[item]:
            tag_cnt[u] = tag_cnt.get(u, 0) + 1 
        
        if u not in tag_sim:
            tag_sim[u] = {}
        for v in item_tag[item]:
            if u == v: 
                continue
            if v not in tag_sim[u]:
                tag_sim[u][v] = 0 
            tag_sim[u][v] += 1 
    for u in tag_sim: 
        for v in tag_sim[u]:
            tag_sim[u][v] /= math.sqrt(tag_cnt[u] * tag_cnt[v])
    
    # 为每个用户扩展标签
    user_tags = {}
    for user in train: 
        if user not in user_tags:
            user_tags[user] = {}
        for item in train[user]:
            for tag in train[user][item]:
                user_tags[user][tag] = user_tags[user].get(tag, 0) + 1 
        expand_tags =  {}
        for user in user_tags:
            ## 如果大于指定数量，则直接添加
            if len(user_tags[user]) >= M: 
                expand_tags[user] = user_tags[user]
                continue
            ## 不满M个进行扩展
            expand_tags[user] = {}
            seen_tags = set(user_tags[user])
            for tag in user_tags[user]:
                if tag in tag_sim:
                    for t in tag_sim[tag]:
                        if t in seen_tags:
                            continue
                        if t not in expand_tags[user]:
                            expand_tags[user][t] = 0 
                        expand_tags[user][t] += user_tags[user][tag] * tag_sim[tag][t]
            expand_tags[user].update(user_tags[user])
            expand_tags[user] = dict(list(sorted(expand_tags[user].items(), key=lambda x: x[1], reverse=True)[:M]))
    
    ## SimpleTagBased算法
    tag_items = {}
    for user in train: 
        for item in train[user]:
            for tag in train[user][item]:
                if tag not in tag_items:
                    tag_items[tag] = {}
                if item not in tag_items[tag]:
                    tag_items[tag][item] = 0 
                tag_items[tag][item] += 1 
    
    def GetRecommendation(user):
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag in expand_tags[user]:
            for item in tag_items[tag]:
                if item in seen_items:
                    continue
                if item not in item_score:
                    item_score[item] = 0 
                item_score[item] += expand_tags[user][tag] * tag_items[tag][item]
        item_score = list(sorted(item_score.items(), key=lambda x: x[1], reverse=True))[:N]
        return item_score
    
    return GetRecommendation

# 实验

In [39]:
class Experiment:
    def __init__(self, M, N, fp="data/hetrec2011-delicious-2k/user_taggedbookmarks.dat", rt='SimpleTagBased'):
        self.M = M 
        self.N = N 
        self.fp = fp
        self.rt = rt 
        self.alg = {"SimpleTagBased": SimpleTagBased, "TagBasedTFIDF": TagBasedTFIDF,
                   "TagBasedTFIDF_Improved": TagBasedTFIDF_Improved, "ExpandTagBased": ExpandTagBased}
        
    
    # 定义单次实验
    @timmer
    def worker(self,train, test):
        getRecommendation = self.alg[self.rt](train, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 多次实验取平均
    @timmer 
    def run(self):
        metrics = {"Precision": 0, "Recall": 0,
                  "Coverage": 0, "Diversity": 0, 
                  "Popularity": 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print(f"Experiment {ii}: ")
            metric = self.worker(train, test)
            metrics = {k:metrics[k]+metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print("Average Result (M={}, N={}): {}".format(self.M, self.N, metrics))

## SimpleTagBased实验

In [40]:
M, N = 10, 10 
exp = Experiment(M, N, rt="SimpleTagBased")
exp.run()

Func loadData, run time: 0.923646
Func splitData, run time: 0.217856
Experiment 0: 
Metric:  {'Precision': 0.31, 'Recall': 0.52, 'Coverage': 5.4, 'Diversity': 0.8002490787673343, 'Popularity': 1.9951}
Func worker, run time: 6.545003
Func splitData, run time: 0.115637
Experiment 1: 
Metric:  {'Precision': 0.35, 'Recall': 0.58, 'Coverage': 5.41, 'Diversity': 0.8049237219810359, 'Popularity': 1.9627}
Func worker, run time: 6.621846
Func splitData, run time: 0.211591
Experiment 2: 
Metric:  {'Precision': 0.3, 'Recall': 0.5, 'Coverage': 5.42, 'Diversity': 0.8044458131615932, 'Popularity': 1.9978}
Func worker, run time: 6.552932
Func splitData, run time: 0.215611
Experiment 3: 
Metric:  {'Precision': 0.2, 'Recall': 0.33, 'Coverage': 5.43, 'Diversity': 0.8079008776595029, 'Popularity': 2.0065}
Func worker, run time: 6.469662
Func splitData, run time: 0.222466
Experiment 4: 
Metric:  {'Precision': 0.2, 'Recall': 0.32, 'Coverage': 5.37, 'Diversity': 0.8018008174713677, 'Popularity': 1.9955}
Fun

## TagBasedTFIDF实验

In [41]:
exp = Experiment(M, N, rt="TagBasedTFIDF")
exp.run()

Func loadData, run time: 0.924418
Func splitData, run time: 0.117036
Experiment 0: 
Metric:  {'Precision': 0.38, 'Recall': 0.63, 'Coverage': 5.02, 'Diversity': 0.7980656686930976, 'Popularity': 2.2352}
Func worker, run time: 36.691683
Func splitData, run time: 0.110977
Experiment 1: 
Metric:  {'Precision': 0.44, 'Recall': 0.71, 'Coverage': 5.09, 'Diversity': 0.7993524371550923, 'Popularity': 2.1797}
Func worker, run time: 36.123402
Func splitData, run time: 0.110357
Experiment 2: 
Metric:  {'Precision': 0.46, 'Recall': 0.75, 'Coverage': 5.13, 'Diversity': 0.8010450171102075, 'Popularity': 2.2179}
Func worker, run time: 35.986298
Func splitData, run time: 0.115441
Experiment 3: 
Metric:  {'Precision': 0.38, 'Recall': 0.64, 'Coverage': 5.13, 'Diversity': 0.8083701999821394, 'Popularity': 2.2439}
Func worker, run time: 36.888173
Func splitData, run time: 0.124253
Experiment 4: 
Metric:  {'Precision': 0.35, 'Recall': 0.57, 'Coverage': 5.08, 'Diversity': 0.8011449840521201, 'Popularity': 2.

## TagBasedTFIDF++

In [57]:
exp = Experiment(M, N, rt="TagBasedTFIDF_Improved")
exp.run()

Func loadData, run time: 1.003245
Func splitData, run time: 0.114796
Experiment 0: 
Metric:  {'Precision': 0.36, 'Recall': 0.59, 'Coverage': 5.2, 'Diversity': 0.7738559878850848, 'Popularity': 2.1389}
Func worker, run time: 47.252694
Func splitData, run time: 0.204991
Experiment 1: 
Metric:  {'Precision': 0.41, 'Recall': 0.66, 'Coverage': 5.33, 'Diversity': 0.7763989991514378, 'Popularity': 2.0848}
Func worker, run time: 48.905256
Func splitData, run time: 0.220017
Experiment 2: 
Metric:  {'Precision': 0.44, 'Recall': 0.72, 'Coverage': 5.34, 'Diversity': 0.7782135429512786, 'Popularity': 2.1248}
Func worker, run time: 49.361965
Func splitData, run time: 0.253507
Experiment 3: 
Metric:  {'Precision': 0.35, 'Recall': 0.59, 'Coverage': 5.31, 'Diversity': 0.7856628904294432, 'Popularity': 2.1428}
Func worker, run time: 51.712393
Func splitData, run time: 0.237244
Experiment 4: 
Metric:  {'Precision': 0.33, 'Recall': 0.53, 'Coverage': 5.32, 'Diversity': 0.7774922125492686, 'Popularity': 2.1

## ExpandaTagBased

In [55]:
exp = Experiment(M, N, rt="ExpandTagBased")
exp.run()

Func loadData, run time: 1.020065
Func splitData, run time: 0.226482
Experiment 0: 
Metric:  {'Precision': 0.34, 'Recall': 0.57, 'Coverage': 3.35, 'Diversity': 0.7871941453011421, 'Popularity': 2.3512}
Func worker, run time: 125.082337
Func splitData, run time: 0.210525
Experiment 1: 
Metric:  {'Precision': 0.41, 'Recall': 0.67, 'Coverage': 3.36, 'Diversity': 0.7879216593690705, 'Popularity': 2.2947}
Func worker, run time: 114.393990
Func splitData, run time: 0.223748
Experiment 2: 
Metric:  {'Precision': 0.44, 'Recall': 0.72, 'Coverage': 3.44, 'Diversity': 0.7910547861498147, 'Popularity': 2.3311}
Func worker, run time: 128.503712
Func splitData, run time: 0.206493
Experiment 3: 
Metric:  {'Precision': 0.31, 'Recall': 0.52, 'Coverage': 3.38, 'Diversity': 0.7976921150103528, 'Popularity': 2.3608}
Func worker, run time: 128.812517
Func splitData, run time: 0.110861
Experiment 4: 
Metric:  {'Precision': 0.3, 'Recall': 0.49, 'Coverage': 3.35, 'Diversity': 0.790169428364491, 'Popularity': 