In [1]:
import random 
import math 
import time 
from tqdm import tqdm 

from collections import defaultdict

# 通用函数定义

In [2]:
# 定义装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func {:s}, run time: {:f}".format(func.__name__, stop_time-start_time))
        return res 
    return wrapper

# 数据处理相关函数

In [3]:
class Dataset:
    def __init__(self, fp):
        self.data = self.loadData(fp)
    
    @timmer
    def loadData(self, fp):
        data = [f.strip().split('\t')[:3] for f in open(fp, 'r').readlines()[1:]]
        return data 
    
    @timmer
    def splitData(self, M, k, seed=2019):
        train, test = [], []
        random.seed(seed)
        
        for user, item, tag in self.data: 
            if random.randint(0, M-1) == k:
                test.append((user, item, tag))
            else:
                train.append((user, item, tag))
        # 转换成字典形式, user->set(items)
        def convert_dict(data):
            data_dict = {}
            for user, item, tag in data: 
                if user not in data_dict:
                    data_dict[user] = {}
                if item not in data_dict[user]:
                    data_dict[user][item] = set()
                data_dict[user][item].add(tag)
            for user in data_dict:
                for item in data_dict[user]:
                    data_dict[user][item] = list(data_dict[user][item])
            return data_dict
        
        return convert_dict(train), convert_dict(test)

# 评价指标

In [4]:
class Metric:
    def __init__(self, train, test, GetRecommendation):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            recs[user] = {}
            for item in self.test[user]:
                rank = self.GetRecommendation(user, item)
                recs[user][item] = rank
        return recs
    
    # 定义精确率的指标
    def precision(self):
        all, hit = 0, 0 
        for user in self.test:
            for item in self.test[user]:
                test_tags = set(self.test[user][item])
                rank = self.recs[user][item]
                for tag, score in rank:
                    if tag  in test_tags:
                        hit += 1 
                all += len(rank)
        return round(hit/all*100, 2)
    
    # 定义召回率指标计算方式
    def recall(self):
        all, hit = 0, 0 
        for user in self.test: 
            for item in self.test[user]:
                test_tags = set(self.test[user][item])
                rank = self.recs[user][item]
                for tag, score in rank: 
                    if tag in test_tags:
                        hit += 1 
                all += len(test_tags)
        return round(hit / all * 100, 2)
    
    def eval(self):
        metric = {"Precision": self.precision(), 
                 "Recall": self.recall()}
        
        print("Metric: ", metric)
        return metric

# 算法实现

- Popular
- UserPopular
- ItemPopular
- HybirdPopular

In [13]:
# 热门标签
def Popular(train, N):
    tags = {}
    for user in train:
        for item in train[user]:
            for tag in train[user][item]:
                tags[tag] = tags.get(tag, 0) + 1 
    
    tags = list(sorted(tags.items(), key=lambda x: x[1], reverse=True))[:N]
    
    def GetRecommendation(user, item):
        return tags
    
    return GetRecommendation

In [6]:
# 推荐用户最热门的标签
def UserPopular(train, N):
    user_tags = {}
    for user in train: 
        user_tags[user] = {}
        for item in train[user]:
            for tag in train[user][item]:
                user_tags[user][tag] = user_tags[user].get(tag, 0) + 1 
    
    user_tags = {k: list(sorted(v.items(), key=lambda x: x[1], reverse=True)) for k, v in user_tags.items()}
    
    def GetRecommendation(user, item):
        if user in user_tags:
            return user_tags[user][:N]
        else:
            get = Popular(train, N)
            return get(user, item)
        
    return GetRecommendation
        

In [8]:
# 推荐物品最热门标签
def ItemPopular(train, N):
    # 统计Item_tags
    item_tags = {}
    for user in train: 
        for item in train[user]:
            if item not in item_tags:
                item_tags[item] = {}
            for tag in train[user][item]:
                item_tags[item][tag] = item_tags[item].get(tag, 0) + 1 
    item_tags = {k: list(sorted(v.items(), key=lambda x: x[1], reverse=True)) for k, v in item_tags.items()}
    
    def GetRecommendation(user, item):
        if item in item_tags:
            return item_tags[item][:N]
        else:
            get = Popular(train, N)
            return get(user, item)
    
    return GetRecommendation

In [20]:
# 联合用户和商品热门推荐
def HybridPopular(train, N, alpha):
    # 统计user_tags
    user_tags = {}
    for user in train:
        user_tags[user] = {}
        for item in train[user]:
            for tag in train[user][item]:
                user_tags[user][tag] = user_tags[user].get(tag, 0) + 1 
    
    # 统计item_tags
    item_tags = {}
    for user in train:
        for item in train[user]:
            if item not in item_tags:
                item_tags[item] = {}
                for tag in train[user][item]:
                    item_tags[item][tag] = item_tags[item].get(item, 0) + 1 
    
    def GetRecommendation(user, item):
        tag_score = {}
        if user in user_tags:
            max_user_tag = max(user_tags[user].values())
            for tag in user_tags[user]:
                if tag not in tag_score:
                    tag_score[tag] = 0 
                tag_score[tag] += (1-alpha) * user_tags[user][tag] / max_user_tag
        
        if item in item_tags:
            max_item_tag = max(item_tags[item].values())
            for tag in item_tags[item]:
                if tag not in tag_score:
                    tag_score[tag] = 0 
                tag_score[tag] += alpha * item_tags[item][tag] / max_item_tag
        return list(sorted(tag_score.items(), key=lambda x: x[1], reverse=True))[:N]
    
    return GetRecommendation

# 实验

In [11]:
class Experiment:
    def __init__(self, M, N, fp="data/hetrec2011-delicious-2k/user_taggedbookmarks.dat", rt="Popular"):
        self.M = M
        self.N = N
        self.fp = fp 
        self.rt = rt
        self.alg = {"Popular": Popular, "UserPopular": UserPopular,
                   "ItemPopular": ItemPopular, "HybridPopular": HybridPopular}
        
        
    #定义单次实验
    @timmer
    def worker(self, train, test, **kwargs):
        getRecommendation = self.alg[self.rt](train, self.N, **kwargs)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 定义多次实验
    @timmer
    def run(self, **kwargs):
        metrics = {"Precision": 0, "Recall": 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print(f"Experiment: {ii}")
            metric = self.worker(train, test, **kwargs)
            metrics = {k: metrics[k]+metric[k] for k in metrics}
        
        metrics = {k: metrics[k] / self.M for k in metrics}
        print("Average Result (M={}, N={}): {}".format(self.M, self.N, metrics))

## Popular实验

In [14]:
M, N = 10, 10 
exp = Experiment(M, N, rt="Popular")
exp.run()

Func loadData, run time: 0.471173
Func splitData, run time: 0.718057
Experiment: 0
Metric:  {'Precision': 0.82, 'Recall': 6.37}
Func worker, run time: 0.127186
Func splitData, run time: 0.542508
Experiment: 1
Metric:  {'Precision': 0.85, 'Recall': 6.67}
Func worker, run time: 0.248760
Func splitData, run time: 0.555226
Experiment: 2
Metric:  {'Precision': 0.82, 'Recall': 6.4}
Func worker, run time: 0.117698
Func splitData, run time: 0.706522
Experiment: 3
Metric:  {'Precision': 0.86, 'Recall': 6.76}
Func worker, run time: 0.116232
Func splitData, run time: 0.697612
Experiment: 4
Metric:  {'Precision': 0.83, 'Recall': 6.46}
Func worker, run time: 0.119412
Func splitData, run time: 0.546528
Experiment: 5
Metric:  {'Precision': 0.86, 'Recall': 6.73}
Func worker, run time: 0.123459
Func splitData, run time: 0.690699
Experiment: 6
Metric:  {'Precision': 0.85, 'Recall': 6.65}
Func worker, run time: 0.118489
Func splitData, run time: 0.700311
Experiment: 7
Metric:  {'Precision': 0.86, 'Recall

## UserPopular实验

In [15]:
exp = Experiment(M, N, rt="UserPopular")
exp.run()

Func loadData, run time: 0.446649
Func splitData, run time: 0.729339
Experiment: 0
Metric:  {'Precision': 3.14, 'Recall': 24.41}
Func worker, run time: 0.179784
Func splitData, run time: 0.772570
Experiment: 1
Metric:  {'Precision': 3.2, 'Recall': 24.95}
Func worker, run time: 0.318386
Func splitData, run time: 0.803341
Experiment: 2
Metric:  {'Precision': 3.15, 'Recall': 24.53}
Func worker, run time: 0.242467
Func splitData, run time: 0.549572
Experiment: 3
Metric:  {'Precision': 3.17, 'Recall': 24.73}
Func worker, run time: 0.525570
Func splitData, run time: 0.548713
Experiment: 4
Metric:  {'Precision': 3.13, 'Recall': 24.38}
Func worker, run time: 0.589575
Func splitData, run time: 0.532585
Experiment: 5
Metric:  {'Precision': 3.23, 'Recall': 25.22}
Func worker, run time: 0.168817
Func splitData, run time: 0.678190
Experiment: 6
Metric:  {'Precision': 3.17, 'Recall': 24.64}
Func worker, run time: 0.230900
Func splitData, run time: 0.705462
Experiment: 7
Metric:  {'Precision': 3.18, 

## ItemPopular

In [16]:
exp = Experiment(M, N, rt="ItemPopular")
exp.run()

Func loadData, run time: 0.351101
Func splitData, run time: 0.692680
Experiment: 0
Metric:  {'Precision': 1.58, 'Recall': 7.98}
Func worker, run time: 55.226680
Func splitData, run time: 0.539450
Experiment: 1
Metric:  {'Precision': 1.59, 'Recall': 8.05}
Func worker, run time: 57.113130
Func splitData, run time: 0.698452
Experiment: 2
Metric:  {'Precision': 1.58, 'Recall': 8.0}
Func worker, run time: 59.773259
Func splitData, run time: 0.708363
Experiment: 3
Metric:  {'Precision': 1.58, 'Recall': 8.02}
Func worker, run time: 55.266559
Func splitData, run time: 0.710807
Experiment: 4
Metric:  {'Precision': 1.59, 'Recall': 8.02}
Func worker, run time: 50.985781
Func splitData, run time: 0.694144
Experiment: 5
Metric:  {'Precision': 1.59, 'Recall': 8.1}
Func worker, run time: 52.782524
Func splitData, run time: 0.702360
Experiment: 6
Metric:  {'Precision': 1.58, 'Recall': 7.97}
Func worker, run time: 49.121496
Func splitData, run time: 0.698344
Experiment: 7
Metric:  {'Precision': 1.58, '

## HybridPopular

In [21]:
for alpha in range(0, 11):
    alpha /= 10 
    print("alpha = ", alpha)
    exp = Experiment(M, N, rt="HybridPopular")
    exp.run(alpha=alpha)

alpha =  0.0
Func loadData, run time: 0.463718
Func splitData, run time: 0.720193
Experiment: 0
Metric:  {'Precision': 3.13, 'Recall': 24.41}
Func worker, run time: 2.046175
Func splitData, run time: 0.684545
Experiment: 1
Metric:  {'Precision': 3.2, 'Recall': 24.97}
Func worker, run time: 2.016257
Func splitData, run time: 0.698015
Experiment: 2
Metric:  {'Precision': 3.15, 'Recall': 24.53}
Func worker, run time: 1.996331
Func splitData, run time: 0.528147
Experiment: 3
Metric:  {'Precision': 3.16, 'Recall': 24.73}
Func worker, run time: 2.175389
Func splitData, run time: 0.532241
Experiment: 4
Metric:  {'Precision': 3.13, 'Recall': 24.39}
Func worker, run time: 2.194852
Func splitData, run time: 0.527457
Experiment: 5
Metric:  {'Precision': 3.23, 'Recall': 25.22}
Func worker, run time: 2.018466
Func splitData, run time: 0.670770
Experiment: 6
Metric:  {'Precision': 3.16, 'Recall': 24.65}
Func worker, run time: 2.035819
Func splitData, run time: 0.689310
Experiment: 7
Metric:  {'Preci

Metric:  {'Precision': 2.88, 'Recall': 22.4}
Func worker, run time: 2.258133
Func splitData, run time: 0.533791
Experiment: 9
Metric:  {'Precision': 2.86, 'Recall': 22.41}
Func worker, run time: 2.285165
Average Result (M=10, N=10): {'Precision': 2.888, 'Recall': 22.528}
Func run, run time: 28.693351
alpha =  0.6
Func loadData, run time: 0.347186
Func splitData, run time: 0.741132
Experiment: 0
Metric:  {'Precision': 2.87, 'Recall': 22.4}
Func worker, run time: 2.085892
Func splitData, run time: 0.689652
Experiment: 1
Metric:  {'Precision': 2.93, 'Recall': 22.9}
Func worker, run time: 2.100322
Func splitData, run time: 0.554241
Experiment: 2
Metric:  {'Precision': 2.88, 'Recall': 22.44}
Func worker, run time: 2.274111
Func splitData, run time: 0.560003
Experiment: 3
Metric:  {'Precision': 2.9, 'Recall': 22.65}
Func worker, run time: 2.290517
Func splitData, run time: 0.745495
Experiment: 4
Metric:  {'Precision': 2.81, 'Recall': 21.94}
Func worker, run time: 2.219900
Func splitData, run