**<center><font size=6> 基于用户的协同过滤 </font></center>**

In [17]:
import numpy as np 
import math 
import time 
from tqdm.autonotebook import tqdm
import random
from collections import defaultdict

# 定义通用函数

In [33]:
# 定义时间装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func {:s}, run time: {:f}".format(func.__name__, stop_time - start_time))
        return res 
    return wrapper

# 数据处理相关

In [6]:
!ls data/ml-1m/

movies.dat  ratings.dat  README  users.dat


In [11]:
with open("data/ml-1m/ratings.dat") as f:
    for line in f: 
        print(line)
        break

1::1193::5::978300760



- 上面数据的含义为 `user_id`::`movie_id`::`rating`::`unix_timestamp`

这里的timestamp是自1970年1月1日开始计算

In [14]:
with open("data/ml-1m/movies.dat", encoding="latin-1") as f:
    for i, line in enumerate(f):      
        print(line)
        if i == 2:
            break

1::Toy Story (1995)::Animation|Children's|Comedy

2::Jumanji (1995)::Adventure|Children's|Fantasy

3::Grumpier Old Men (1995)::Comedy|Romance



- 上面的数据含义为 `movie_id`::`title`(`release_date`)::`tags`

其中`tags`是用`|`分割的，表示影片的类型

In [15]:
with open("data/ml-1m/users.dat", encoding="latin-1") as f:
    for i, line in enumerate(f):      
        print(line)
        if i == 2:
            break

1::F::1::10::48067

2::M::56::16::70072

3::M::25::15::55117



- 上面数据的含义为 `user_id`::`sex`::`age`::`ocupation`::`zip_code`

`zip_code`表示邮政编码

In [34]:
# 数据集的读取
class Dataset:
    def __init__(self, fp):
        self.data = self.loadData(fp)
    
    @timmer
    def loadData(self, fp):
        data = []
        # 只取用户id和电影id
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split("::")[:2])))
        return data 
    
    @timmer
    def splitData(self, M, k, seed=1):
        '''
        data: 加载所有(user_id, movie_id)def数据条目
        M: 划分的数目，最后去M折的平均
        k: 本次是第几次划分，k~[0,M)
        seed: 随机种子数，对于不同k应该设置为相同值
        return train, test
        '''
        train, test = [], [] 
        random.seed(seed)
        for user, item in self.data:
            if random.randint(0, M-1) == k:
                test.append((user, item))
            else:
                train.append((user, item))
        
        ## 处理成字典形式
        def convert_dict(data):
            data_dict = defaultdict(set)
            for user, item in data: 
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict 
        
        return convert_dict(train), convert_dict(test)

# 评价指标

- Precision
- Recall
- Coverage
- Popularity(Novelty)

In [53]:
class Metric:
    
    def __init__(self, train, test, GetRecommendation):
        '''
        GetRecommendation: 为某个用户推荐物品的接口函数，返回值为电影id组成的list
        '''
        self.train = train
        self.test = test 
        self.GetRecommendation = GetRecommendation
        # 保存为测试集用户推荐结果的值
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
    
    ## 定义精确率指标
    def precision(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1 
            all += len(rank)
        return round(hit / all * 100, 2)
    
    ## 定义召回率指标
    def recall(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank: 
                if item in test_items:
                    hit += 1 
            all += len(test_items)
        return round(hit / all * 100, 2)
    
    ## 定义覆盖率指标
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            for item in self.train[user]:
                all_item.add(item)
            rank = self.recs[user]
            for item, score in rank: 
                recom_item.add(item)
                
        return round(len(recom_item) / len(all_item) * 100, 2)
    
    ## 定义新颖度指标
    def popularity(self):
        ## 计算推荐物品的平均流行度
        item_pop = {}
        for user in self.train: 
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1 
        num, pop = 0, 0 
        for user in self.test: 
            rank = self.recs[user]
            for item, score in rank:
                # 取对数，防止长尾问题带来被流行物品主导的问题
                pop += math.log(1+item_pop[item])
                num += 1 
        return round(pop / num, 6)
    
    def eval(self):
        metric = {
            "Precision": self.precision(),
            "Recall": self.recall(),
            "Coverage": self.coverage(),
            "Popularity": self.popularity()
        }
        print("Metric:  ", metric)
        return metric

# 算法实现

- Random
- MostPopular
- UserCF
- UserIIF

In [54]:
# 随机推荐
def Random(train, K, N):
    '''
    train: 训练数据集 
    K: 表示相似用户的人数　
    N: 表示TopN的数量
    return: GetRecommendation函数
    '''
    items = {} 
    # 保存所有的电影
    for user in train: 
        for item in train[user]:
            items[item] = 1 
    
    def GetRecommendation(user):
        # 随机推荐N个没有见过的
        user_items = set(train[user])
        rec_items = {k : items[k] for k in items if k not in user_items}
        rec_items = list(rec_items.items())
        random.shuffle(rec_items)
        return rec_items[:N]
    return GetRecommendation

In [61]:
# 选择没看过中最热门的推荐
def MostPopular(train, K, N):
    items = {}
    for user in train: 
        for item in train[user]:
            items[item] = items.get(item, 0) + 1 
    
    def GetRecommendation(user):
        # 推荐最热门的N部电影
        user_items = set(train[user])
        rec_items = {k: items[k] for k in items if k not in user_items}
        rec_items = list(sorted(rec_items.items(), key=lambda x: x[1], reverse=True))
        return rec_items[:N]
    
    return GetRecommendation

In [56]:
# 基于用户余弦相似度的推荐
def UserCF(train, K, N):
    # 计算item-user的倒排索引
    item_users = defaultdict(set)
    for user in train:
        for item in train[user]:
            item_users[item].add(user)
    
    # 计算用户相似度矩阵
    sim = {}
    num = {}
    for item in item_users:
        # 取出一个商品有行为的所有用户
        users = item_users[item]
        for i, u in enumerate(users):
            # 计算用户操作过物品的数量
            num[u] = num.get(u, 0) + 1 
            # 如果u没有在相似度矩阵出现过
            if u not in sim:
                sim[u] = {} 
            for j, v in enumerate(users):
                if j == i: continue
                sim[u][v] = sim[u].get(v, 0) + 1 
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u]*num[v])
    
    # 按照相似度排序
    sorted_user_sim = {k: list(sorted(v.items(), key=lambda x: x[1], reverse=True)) for k, v in sim.items()}
    
    # 获取接口函数
    def GetRecommendation(user):
        items = {} 
        # 表示用户看过的电影
        seen_items = set(train[user])
        for u, _ in sorted_user_sim[user][:K]:
            for item in train[u]:
                # 去掉用户见过的
                if item not in seen_items:
                    ## 这样得到的一部电影对应的值为，与该用户相似并且看过看电影的相似度的和
                    items[item] = items.get(item, 0) + sim[user][u]
        recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N]
        return recs
    return GetRecommendation
                    

In [57]:
# 基于改进的余弦相似度的推荐
def UserIIF(train, K, N):
    # 计算item-user的倒排索引
    item_users = defaultdict(set)
    for user in train:
        for item in train[user]:
            item_users[item].add(user)
    
    # 计算用户相似度矩阵
    sim = {}
    num = {}
    for item in item_users:
        # 取出一个商品有行为的所有用户
        users = item_users[item]
        for i, u in enumerate(users):
            # 计算用户操作过物品的数量
            num[u] = num.get(u, 0) + 1 
            # 如果u没有在相似度矩阵出现过
            if u not in sim:
                sim[u] = {} 
            for j, v in enumerate(users):
                if j == i: continue
                ## 相比上面主要改进在这里
                sim[u][v] = sim[u].get(v, 0) + 1 / math.log(1 + len(users)) 
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u]*num[v])
    
    # 按照相似度排序
    sorted_user_sim = {k: list(sorted(v.items(), key=lambda x: x[1], reverse=True)) for k, v in sim.items()}
    
    # 获取接口函数
    def GetRecommendation(user):
        items = {} 
        # 表示用户看过的电影
        seen_items = set(train[user])
        for u, _ in sorted_user_sim[user][:K]:
            for item in train[u]:
                # 去掉用户见过的
                if item not in seen_items:
                    ## 这样得到的一部电影对应的值为，与该用户相似并且看过看电影的相似度的和
                    items[item] = items.get(item, 0) + sim[user][u]
        recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N]
        return recs
    return GetRecommendation

# 实验

- Random实验
- MostPopular实验
- UserCF实验，K=[5, 10, 20, 40, 80, 160]
- UserIIF实验, K=80

In [58]:
class Experiment:
    def __init__(self, M, K, N, fp="data/ml-1m/ratings.dat", rt="UserCF"):
        self.M = M 
        self.K = K
        self.N = N 
        self.fp = fp 
        self.rt = rt 
        self.alg = {"Random": Random, "MostPopular": MostPopular, "UserCF": UserCF, "UserIIF": UserIIF}
        
    # 定义单次实验
    @timmer
    def worker(self, train, test):
        getRecommendation = self.alg[self.rt](train, self.K, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 多次实验取平均
    @timmer
    def run(self):
        metrics = {"Precision": 0, "Recall": 0,
                  "Coverage": 0, "Popularity": 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print(f"Experiment {ii}: ")
            metric = self.worker(train, test)
            metrics = {k: metrics[k]+metric[k] for k in metrics}
        
        metrics = {k: metrics[k] / self.M for k in metrics}
        
        print(f"Average Result (M={self.M}, K={self.K}, N={self.N}): {metrics}")

In [59]:
# 1 Random实验
M, N = 8, 10 
K = 0 
random_exp = Experiment(M, K, N, rt="Random")
random_exp.run()

Func loadData, run time: 0.750989
Func splitData, run time: 1.252777
Experiment 0: 
Metric:   {'Precision': 0.61, 'Recall': 0.29, 'Coverage': 100.0, 'Popularity': 4.395145}
Func worker, run time: 11.458412
Func splitData, run time: 1.047582
Experiment 1: 
Metric:   {'Precision': 0.6, 'Recall': 0.29, 'Coverage': 100.03, 'Popularity': 4.384132}
Func worker, run time: 12.814348
Func splitData, run time: 1.134164
Experiment 2: 
Metric:   {'Precision': 0.64, 'Recall': 0.3, 'Coverage': 100.0, 'Popularity': 4.38877}
Func worker, run time: 12.070603
Func splitData, run time: 1.195896
Experiment 3: 
Metric:   {'Precision': 0.61, 'Recall': 0.29, 'Coverage': 100.0, 'Popularity': 4.392672}
Func worker, run time: 11.994375
Func splitData, run time: 1.103245
Experiment 4: 
Metric:   {'Precision': 0.62, 'Recall': 0.3, 'Coverage': 100.0, 'Popularity': 4.394046}
Func worker, run time: 11.781370
Func splitData, run time: 1.077854
Experiment 5: 
Metric:   {'Precision': 0.56, 'Recall': 0.27, 'Coverage': 1

In [62]:
# 2 MostPopular实验
M, N = 8, 10 
K = 0 
random_exp = Experiment(M, K, N, rt="MostPopular")
random_exp.run()

Func loadData, run time: 0.771731
Func splitData, run time: 1.050749
Experiment 0: 
Metric:   {'Precision': 12.85, 'Recall': 6.17, 'Coverage': 2.47, 'Popularity': 7.724273}
Func worker, run time: 5.766155
Func splitData, run time: 1.294367
Experiment 1: 
Metric:   {'Precision': 13.07, 'Recall': 6.26, 'Coverage': 2.28, 'Popularity': 7.721385}
Func worker, run time: 5.852889
Func splitData, run time: 1.046569
Experiment 2: 
Metric:   {'Precision': 12.89, 'Recall': 6.16, 'Coverage': 2.44, 'Popularity': 7.722067}
Func worker, run time: 5.817492
Func splitData, run time: 1.036247
Experiment 3: 
Metric:   {'Precision': 12.81, 'Recall': 6.15, 'Coverage': 2.49, 'Popularity': 7.723152}
Func worker, run time: 5.754249
Func splitData, run time: 1.157221
Experiment 4: 
Metric:   {'Precision': 12.7, 'Recall': 6.11, 'Coverage': 2.47, 'Popularity': 7.724644}
Func worker, run time: 5.842018
Func splitData, run time: 1.117302
Experiment 5: 
Metric:   {'Precision': 12.9, 'Recall': 6.22, 'Coverage': 2.38

In [63]:
# 3 UserCF实验
M, N = 8, 10 
for K in [5, 10, 20, 40, 80, 160]:
    print("K = ", K)
    cf_exp = Experiment(M, K, N, rt="UserCF")
    cf_exp.run()

K =  5
Func loadData, run time: 0.818196
Func splitData, run time: 1.058739
Experiment 0: 
Metric:   {'Precision': 16.91, 'Recall': 8.12, 'Coverage': 52.36, 'Popularity': 6.819144}
Func worker, run time: 123.513309
Func splitData, run time: 1.093134
Experiment 1: 
Metric:   {'Precision': 17.05, 'Recall': 8.16, 'Coverage': 52.03, 'Popularity': 6.815604}
Func worker, run time: 123.114326
Func splitData, run time: 1.056307
Experiment 2: 
Metric:   {'Precision': 16.91, 'Recall': 8.09, 'Coverage': 51.69, 'Popularity': 6.818911}
Func worker, run time: 121.811897
Func splitData, run time: 1.039652
Experiment 3: 
Metric:   {'Precision': 16.95, 'Recall': 8.15, 'Coverage': 52.11, 'Popularity': 6.817778}
Func worker, run time: 120.892206
Func splitData, run time: 1.087227
Experiment 4: 
Metric:   {'Precision': 17.06, 'Recall': 8.2, 'Coverage': 52.14, 'Popularity': 6.821389}
Func worker, run time: 122.657266
Func splitData, run time: 1.050489
Experiment 5: 
Metric:   {'Precision': 16.76, 'Recall':

Func splitData, run time: 1.044936
Experiment 1: 
Metric:   {'Precision': 25.07, 'Recall': 12.0, 'Coverage': 15.43, 'Popularity': 7.359474}
Func worker, run time: 159.621178
Func splitData, run time: 1.067230
Experiment 2: 
Metric:   {'Precision': 24.94, 'Recall': 11.92, 'Coverage': 15.51, 'Popularity': 7.365722}
Func worker, run time: 159.590247
Func splitData, run time: 1.069144
Experiment 3: 
Metric:   {'Precision': 24.7, 'Recall': 11.87, 'Coverage': 15.57, 'Popularity': 7.367787}
Func worker, run time: 160.274154
Func splitData, run time: 1.075217
Experiment 4: 
Metric:   {'Precision': 24.54, 'Recall': 11.8, 'Coverage': 15.42, 'Popularity': 7.368638}
Func worker, run time: 160.784007
Func splitData, run time: 1.086666
Experiment 5: 
Metric:   {'Precision': 24.77, 'Recall': 11.95, 'Coverage': 15.52, 'Popularity': 7.370501}
Func worker, run time: 160.496461
Func splitData, run time: 1.060243
Experiment 6: 
Metric:   {'Precision': 24.9, 'Recall': 11.97, 'Coverage': 15.31, 'Popularity'

In [64]:
M, N = 8, 10 
K = 80 

iif_exp = Experiment(M, K, N, rt="UserIIF")
iif_exp.run()

Func loadData, run time: 0.782182
Func splitData, run time: 1.133177
Experiment 0: 
Metric:   {'Precision': 25.36, 'Recall': 12.18, 'Coverage': 21.33, 'Popularity': 7.26129}
Func worker, run time: 294.602093
Func splitData, run time: 1.068849
Experiment 1: 
Metric:   {'Precision': 25.5, 'Recall': 12.21, 'Coverage': 21.39, 'Popularity': 7.248747}
Func worker, run time: 297.410195
Func splitData, run time: 1.204592
Experiment 2: 
Metric:   {'Precision': 25.39, 'Recall': 12.14, 'Coverage': 21.33, 'Popularity': 7.255987}
Func worker, run time: 295.427124
Func splitData, run time: 1.186395
Experiment 3: 
Metric:   {'Precision': 25.08, 'Recall': 12.05, 'Coverage': 21.4, 'Popularity': 7.259753}
Func worker, run time: 290.514627
Func splitData, run time: 1.067862
Experiment 4: 
Metric:   {'Precision': 24.92, 'Recall': 11.98, 'Coverage': 21.25, 'Popularity': 7.261206}
Func worker, run time: 289.538990
Func splitData, run time: 1.077115
Experiment 5: 
Metric:   {'Precision': 25.14, 'Recall': 12.

# 实验结果

(1) Random实验

- Time: 105.416712

- Average Result: {'Precision': 0.61, 'Recall': 0.2925, 'Coverage': 100.0075, 'Popularity': 4.38941975}

(2) MostPopular实验

- Time: 56.971059

- Average Result: {'Precision': 12.832500000000001, 'Recall': 6.16125, 'Coverage': 2.42625, 'Popularity': 7.723263749999999}

(3) UserCF实验

> K = 5

- Time: 989.072319

- Average Result: 'Precision': 16.90125, 'Recall': 8.114999999999998, 'Coverage': 52.04, 'Popularity': 6.819473}

> K = 10

- Time: 986.891163

- Average Result: {'Precision': 20.465, 'Recall': 9.8275, 'Coverage': 41.6375, 'Popularity': 6.9791680000000005}

> K = 20

- Time: 1006.181191

- Average Result: {'Precision': 22.99125, 'Recall': 11.03875, 'Coverage': 32.778749999999995, 'Popularity': 7.1023555}


> K = 40

- Time: 1058.292992

- Average Result: {'Precision': 24.5375, 'Recall': 11.783749999999998, 'Coverage': 25.886249999999997, 'Popularity': 7.2022048750000005}


> K = 80

- Time: 1134.579971

- Average Result: {'Precision': 25.10875, 'Recall': 12.055, 'Coverage': 20.24625, 'Popularity': 7.28811625}

> K = 160

- Time: 1291.699050

- Average Result: {'Precision': 24.813750000000002, 'Recall': 11.91375, 'Coverage': 15.392499999999998, 'Popularity': 7.367553124999999}

(4) UserIIF实验

- Time: 2363.174543

- Average Result: {'Precision': 25.21625, 'Recall': 12.106250000000001, 'Coverage': 21.3175, 'Popularity': 7.2588870000000005}

# 总结

- 数据集分割时保证使用相同的`seed`，这样每次划分的结果相同

- 主要各个评价指标的实现

- 为每个用户推荐时，要去掉它们已经看过的

- 倒排电影-用户索引，时间优化

- 推荐电影的时候`K`表示相似的用户数，`N`表示推荐的电影数