In [1]:
import numpy as np 
import pandas as pd 
import math
import random
import time
from tqdm.autonotebook import tqdm, trange

from collections import defaultdict



# 定义通用函数

In [2]:
# 定义时间装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func {:s}, run time: {:f}".format(func.__name__, stop_time - start_time))
        return res 
    return wrapper

# 数据处理相关

In [31]:
with open("data/ml-1m/movies.dat", 'r', encoding="latin1") as f: 
    for l in f:
        print(l)
        break

1::Toy Story (1995)::Animation|Children's|Comedy



- 对应 `movie_id`, `movie_name`, `type`

In [89]:
class Dataset:
    def __init__(self, fp, ip):
        self.data, self.content = self.loadData(fp, ip)
        
    @timmer
    def loadData(self, fp, ip):
        data = []
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split('::')[:2])))
        
        contents = {}    
        for l in open(ip, "rb"):
            l = str(l)[2:-1]
            contents[int(l.strip().split("::")[0])] = l.strip().split("::")[-1].split("|")
        return data, contents
    
    @timmer 
    def splitData(self, M, k, seed=2019):
        train, test = [], []
        random.seed(seed)
        for user, item in self.data:
            if random.randint(0, M-1) == k: 
                test.append((user, item))
            else:
                train.append((user, item))
        
        # 处理成字典形式，user->set(items)
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                if user not in data_dict:
                    data_dict[user] = set()
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict
        return convert_dict(train), convert_dict(test), self.content

# 评价指标

In [95]:
class Metric:
    
    def __init__(self, train, test, GetRecommendation):
        '''
        GetRecommendation: 为某个用户推荐物品的接口函数，返回值为电影id组成的list
        '''
        self.train = train
        self.test = test 
        self.GetRecommendation = GetRecommendation
        # 保存为测试集用户推荐结果的值
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
    
    ## 定义精确率指标
    def precision(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1 
            all += len(rank)
        return round(hit / all * 100, 2)
    
    ## 定义召回率指标
    def recall(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank: 
                if item in test_items:
                    hit += 1 
            all += len(test_items)
        return round(hit / all * 100, 2)
    
    ## 定义覆盖率指标
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            for item in self.train[user]:
                all_item.add(item)
            rank = self.recs[user]
            for item, score in rank: 
                recom_item.add(item)
                
        return round(len(recom_item) / len(all_item) * 100, 2)
    
    ## 定义新颖度指标
    def popularity(self):
        ## 计算推荐物品的平均流行度
        item_pop = {}
        for user in self.train: 
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1 
        num, pop = 0, 0 
        for user in self.test: 
            rank = self.recs[user]
            for item, score in rank:
                if item in item_pop:
                # 取对数，防止长尾问题带来被流行物品主导的问题
                    pop += math.log(1+item_pop[item])
                    num += 1 
        return round(pop / num, 6)
    
    def eval(self):
        metric = {
            "Precision": self.precision(),
            "Recall": self.recall(),
            "Coverage": self.coverage(),
            "Popularity": self.popularity()
        }
        print("Metric:  ", metric)
        return metric

# ContentItemKNN算法实现

In [96]:
def ContentItemKNN(train, content, K, N):
    # 建立word-item的倒排表
    word_item = {}
    for item in content:
        for word in content[item]:
            if word not in word_item:
                word_item[word] = {}
            word_item[word][item] = 1 
    
    # 对于每一步电影，初始化对应的content值
    for word in word_item:
        for item in word_item[word]:
            word_item[word][item] /= math.log(1 + len(word_item[word]))
    
    # 计算相似度
    item_sim = {}
    mo = {}
    for word in word_item:
        ## 对于每一部电影
        for u in word_item[word]:
            if u not in item_sim:
                item_sim[u] = {}
                mo[u] = 0 
            mo[u] += word_item[word][u] ** 2 
            # 对于其他电影
            for v in word_item[word]:
                if u==v: 
                    continue
                if v not in item_sim[u]:
                    item_sim[u][v] = 0 
                item_sim[u][v] += word_item[word][u] * word_item[word][v]
    for u in item_sim:
        for v in item_sim[u]:
            item_sim[u][v] /= math.sqrt(mo[u]*mo[v])
    
    # 按照相似度排序
    sorted_item_sim = {k: list(sorted(v.items(), key=lambda x: x[1], reverse=True)) for k, v, in item_sim.items()}
    #print(list(sorted_item_sim.keys())[:10])
    #print(list(sorted_item_sim.values())[:10])
    
    # 获取接口函数
    def GetRecommendation(user):
        items = {}
        seen_items = set(train[user])
        for item in train[user]:
            for u, _ in sorted_item_sim[item][:K]:
                # 去掉用户看过的
                if u not in seen_items:
                    if u not in items:
                        items[u] = 0 
                    items[u] += item_sim[item][u]
        recs = list(sorted(items.items(), key=lambda x: x[1], reverse=True))[:N]
        #print(recs)
        return recs
    return GetRecommendation

# 定义实验

In [97]:
class Experiment:
    def __init__(self, M, N, K, fp="data/ml-1m/ratings.dat",  ip="data/ml-1m/movies.dat"):
        self.M = M 
        self.K = K
        self.N = N
        self.fp = fp 
        self.ip = ip 
        self.alg = ContentItemKNN
    
    # 定义单次实验
    @timmer
    def worker(self, train, test, content):
        getRecommendation = self.alg(train, content, self.K, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 多次实验取平均
    def run(self):
        metrics = {"Precision": 0, "Recall": 0,
                  "Coverage": 0, "Popularity": 0}
        dataset = Dataset(self.fp, self.ip)
        #print(dataset.data[:10])
        for ii in range(self.M):
            train, test, content = dataset.splitData(self.M, ii)
            #print(list(train.keys())[:10])
            print(f"Experiment {ii}: ")
            metric = self.worker(train, test, content)
            metrics = {k: metrics[k]+metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print("Average Result (M={}, N={}, K={}): {}".format(self.M, self.N, self.K, metrics))

# 实验

In [98]:
M, N, K = 8, 10, 10 
exp = Experiment(M, N, K)
exp.run()

Func loadData, run time: 0.813890
Func splitData, run time: 1.393228
Experiment 0: 
Metric:   {'Precision': 1.85, 'Recall': 0.89, 'Coverage': 16.15, 'Popularity': 4.607609}
Func worker, run time: 3.893879
Func splitData, run time: 1.059000
Experiment 1: 
Metric:   {'Precision': 1.78, 'Recall': 0.85, 'Coverage': 16.58, 'Popularity': 4.617612}
Func worker, run time: 3.884301
Func splitData, run time: 1.297688
Experiment 2: 
Metric:   {'Precision': 1.81, 'Recall': 0.87, 'Coverage': 16.73, 'Popularity': 4.604153}
Func worker, run time: 3.912960
Func splitData, run time: 1.058687
Experiment 3: 
Metric:   {'Precision': 1.8, 'Recall': 0.87, 'Coverage': 16.81, 'Popularity': 4.615706}
Func worker, run time: 3.862634
Func splitData, run time: 1.251456
Experiment 4: 
Metric:   {'Precision': 1.79, 'Recall': 0.86, 'Coverage': 16.97, 'Popularity': 4.660526}
Func worker, run time: 3.885685
Func splitData, run time: 1.039999
Experiment 5: 
Metric:   {'Precision': 1.85, 'Recall': 0.89, 'Coverage': 16.3