In [5]:
# 导入相关的包
import random
import math 
import numpy as np 
import time 
from tqdm.autonotebook import tqdm, trange

from collections import defaultdict

# 通用函数

In [2]:
# 定义时间装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func {:s}, run time: {:f}".format(func.__name__, stop_time - start_time))
        return res 
    return wrapper

# 数据处理相关函数

In [3]:
# 数据集的读取
class Dataset:
    def __init__(self, fp):
        self.data = self.loadData(fp)
    
    @timmer
    def loadData(self, fp):
        data = []
        # 只取用户id和电影id
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split("::")[:2])))
        return data 
    
    @timmer
    def splitData(self, M, k, seed=1):
        '''
        data: 加载所有(user_id, movie_id)def数据条目
        M: 划分的数目，最后去M折的平均
        k: 本次是第几次划分，k~[0,M)
        seed: 随机种子数，对于不同k应该设置为相同值
        return train, test
        '''
        train, test = [], [] 
        random.seed(seed)
        for user, item in self.data:
            if random.randint(0, M-1) == k:
                test.append((user, item))
            else:
                train.append((user, item))
        
        ## 处理成字典形式
        def convert_dict(data):
            data_dict = defaultdict(set)
            for user, item in data: 
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict 
        
        return convert_dict(train), convert_dict(test)

# Metirc函数

In [4]:
class Metric:
    
    def __init__(self, train, test, GetRecommendation):
        '''
        GetRecommendation: 为某个用户推荐物品的接口函数，返回值为电影id组成的list
        '''
        self.train = train
        self.test = test 
        self.GetRecommendation = GetRecommendation
        # 保存为测试集用户推荐结果的值
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
    
    ## 定义精确率指标
    def precision(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1 
            all += len(rank)
        return round(hit / all * 100, 2)
    
    ## 定义召回率指标
    def recall(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank: 
                if item in test_items:
                    hit += 1 
            all += len(test_items)
        return round(hit / all * 100, 2)
    
    ## 定义覆盖率指标
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            for item in self.train[user]:
                all_item.add(item)
            rank = self.recs[user]
            for item, score in rank: 
                recom_item.add(item)
                
        return round(len(recom_item) / len(all_item) * 100, 2)
    
    ## 定义新颖度指标
    def popularity(self):
        ## 计算推荐物品的平均流行度
        item_pop = {}
        for user in self.train: 
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1 
        num, pop = 0, 0 
        for user in self.test: 
            rank = self.recs[user]
            for item, score in rank:
                # 取对数，防止长尾问题带来被流行物品主导的问题
                pop += math.log(1+item_pop[item])
                num += 1 
        return round(pop / num, 6)
    
    def eval(self):
        metric = {
            "Precision": self.precision(),
            "Recall": self.recall(),
            "Coverage": self.coverage(),
            "Popularity": self.popularity()
        }
        print("Metric:  ", metric)
        return metric

# LFM算法实现

In [20]:
def LFM(train, ratio, K, lr, step, lmbda, N):
    '''
    train: 训练数据　
    ratio: 负采样的 负/正的比例
    K: 隐语义的个数　
    lr: 初始学习率
    step: 迭代次数
    lmbda: 正则化系数　
    N: 推荐TpoN的物品数目
    '''
    
    all_items = {}
    for user in train:
        for item in train[user]:
            # 计算所有商品的出现的次数
            all_items[item] = all_items.get(item, 0) + 1 
            
    all_items = list(all_items.items())
    items = [x[0] for x in all_items]
    pops = [x[1] for x in all_items]
    
    # 负采样，按照流行度采样
    def nSample(data, ratio):
        new_data = defaultdict(dict)
        # 正样本
        for user in data:
            for item in data[user]:
                new_data[user][item] = new_data[user].get(item, 0) + 1 
        
        # 负样本
        for user in new_data:
            seen = set(new_data[user])
            ## 记录已经看过的电影的数量
            pos_num = len(seen)
            ## 第一个参数表示候选集合，第二个参数表示选出来的集合大小，第三个参数表示每一个元素被选择的可能性
            item = np.random.choice(items, int(pos_num*ratio*3), pops)
            item = [x for x in item if x not in seen][:int(pos_num*ratio)]
            new_data[user].update({x: 0 for x in item})
        return new_data
    
    # 训练　
    P, Q = {}, {}
    ## 对每个用户生成隐状态的概率分布
    for user in train: 
        P[user] = np.random.random(K)
    ## 对每个产品生成对应的隐状态概率分布
    for item in items:
        Q[item] = np.random.random(K)
    
    for s in trange(step):
        data = nSample(train, ratio)
        for user in data: 
            ## 随机梯度下降
            for item in data[user]:
                ## 计算误差
                eui = data[user][item] - (P[user]*Q[item]).sum()
                P[user] += lr * (Q[item]*eui - lmbda * P[user])
                Q[item] += lr * (P[user]*eui - lmbda * Q[item])
        # 调整学习率
        lr *= 0.9 
    
    # 获取接口函数
    def GetRecommendation(user):
        seen_items = set(train[user])
        recs = {}
        ## 对于所有的商品
        for item in items:
            if item not in seen_items:
                recs[item] = (P[user]*Q[item]).sum()
        recs = list(sorted(recs.items(), key=lambda x: x[1], reverse=True))[:N]
        return recs
    
    return GetRecommendation
        

# LFM实验

- M=8, N=10, ratio=[1,2,3,5,10,20]

In [22]:
class Experiment:
    def __init__(self, M, N, ratio=1, K=100, lr=0.02, step=100, lmbda=0.01, fp="data/ml-1m/ratings.dat"):
        '''
        M: 表示进行多少次实验 
        N: 表示TopN推荐商品数
        ratio: 负正样本比
        K: 隐语义个数
        lr: 学习率　
        step: 训练步数　
        lmbda: 正则化系数
        '''
        self.M = M 
        self.K = K 
        self.N = N
        self.ratio = ratio 
        self.lr = lr 
        self.step = step 
        self.lmbda = lmbda
        self.fp = fp 
        self.alg = LFM
        
    # 定义单次实验
    @timmer
    def worker(self, train, test):
        getRecommendation = self.alg(train, self.ratio, self.K, self.lr, self.step, self.lmbda, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    #　定义多次实验取平均
    def run(self):
        metrics = {"Precision": 0, "Recall": 0, 
                  "Coverage": 0, "Popularity": 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print(f"Experiment {ii}")
            metric = self.worker(train, test)
            metrics = {k: metrics[k] + metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print("Average Result (M={}, N+{}, ratio={}): {}".format(self.M, self.N, self.ratio, metrics))
        

# 实验

In [None]:
M, N = 8, 10 
for r in [1,2,3,5,10,20]:
    print(f"Ratio={r}")
    exp = Experiment(M, N, ratio=r)
    exp.run()

Ratio=1
Func loadData, run time: 0.792942
Func splitData, run time: 1.142637
Experiment 0


HBox(children=(IntProgress(value=0), HTML(value='')))

Metric:   {'Precision': 19.04, 'Recall': 9.15, 'Coverage': 88.08, 'Popularity': 6.303289}
Func worker, run time: 2139.128076
Func splitData, run time: 1.877735
Experiment 1


HBox(children=(IntProgress(value=0), HTML(value='')))

Metric:   {'Precision': 18.77, 'Recall': 8.99, 'Coverage': 89.32, 'Popularity': 6.287361}
Func worker, run time: 2125.120253
Func splitData, run time: 1.336442
Experiment 2


HBox(children=(IntProgress(value=0), HTML(value='')))