In [7]:
import numpy as np 
import pandas as pd 
import math
import random
import time
from tqdm.autonotebook import tqdm, trange

from collections import defaultdict

# 定义装饰器函数

In [2]:
# 定义时间装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func {:s}, run time: {:f}".format(func.__name__, stop_time - start_time))
        return res 
    return wrapper

# 数据处理相关

In [5]:
with open("data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", 'r') as f: 
    for line in f: 
        print(line)
        break

00000c289a1829a808ac09c00daf10bc3c4e223b	3bd73256-3905-4f3a-97e2-8b341527f805	betty blowtorch	2137



- 分别表示`user_id`, `item_id`, `title`

In [6]:
with open("data/lastfm-dataset-360K/usersha1-profile.tsv", 'r') as f: 
    for line in f: 
        print(line)
        break

00000c289a1829a808ac09c00daf10bc3c4e223b	f	22	Germany	Feb 1, 2007



- 分别表示 `user_id`, `gender`, `age`, `country`, `time`

In [27]:
# 定义数据处理函数
class Dataset:
    def __init__(self, fp, up):
        # fp: 数据文件的路径
        # up: 用户文件路径
        self.data, self.profile = self.loadData(fp, up)
        
    @timmer 
    def loadData(self, fp, up):
        data = []
        with open(fp, 'r') as f: 
            for l in f: 
                data.append(tuple(l.strip().split('\t')[:2]))
        
        profile = {} 
        with open(up, "r") as f: 
            for l in f: 
                user, gender, age, country, _ = l.strip().split('\t')
                try:
                    int(age)
                except:
                    age = -1
            
                profile[user] = {"gender": gender, "age": int(age), "country":  country}
                
        # 按照用户采样
        users = list(profile.keys())
        random.shuffle(users)
        users = set(users[:5000])
        data = [x for x in data if x[0] in users]
        profile = {k: profile[k]  for k in users}
        
        return data, profile 
    
    @timmer
    def splitData(self, M, k, seed=1):
        train, test = [], []
        random.seed(seed)
        
        for user, item in self.data: 
            if random.randint(0, M-1) == k: 
                test.append((user, item))
            else:
                train.append((user, item))
        
        # 处理成字典的形式，user->set(items)
        def convert_dict(data):
            data_dict = defaultdict(set)
            for user, item in data:
                data_dict[user].add(item)
            
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict
        
        return convert_dict(train), convert_dict(test), self.profile
                

# 定义评价指标函数

In [47]:
class Metric:
    
    def __init__(self, train, test, GetRecommendation):
        '''
        GetRecommendation: 为某个用户推荐物品的接口函数，返回值为电影id组成的list
        '''
        self.train = train
        self.test = test 
        self.GetRecommendation = GetRecommendation
        # 保存为测试集用户推荐结果的值
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for i, user in enumerate(self.test):
            #print(i, user)
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
    
    ## 定义精确率指标
    def precision(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1 
            all += len(rank)
        return round(hit / all * 100, 2)
    
    ## 定义召回率指标
    def recall(self):
        all, hit = 0, 0 
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank: 
                if item in test_items:
                    hit += 1 
            all += len(test_items)
        return round(hit / all * 100, 2)
    
    ## 定义覆盖率指标
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            if user in self.train:
                for item in self.train[user]:
                    all_item.add(item)
            rank = self.recs[user]
            for item, score in rank: 
                recom_item.add(item)
                
        return round(len(recom_item) / len(all_item) * 100, 2)
    
    ## 定义新颖度指标
    def popularity(self):
        ## 计算推荐物品的平均流行度
        item_pop = {}
        for user in self.train: 
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1 
        num, pop = 0, 0 
        for user in self.test: 
            rank = self.recs[user]
            for item, score in rank:
                # 取对数，防止长尾问题带来被流行物品主导的问题
                pop += math.log(1+item_pop[item])
                num += 1 
        return round(pop / num, 6)
    
    def eval(self):
        metric = {
            "Precision": self.precision(),
            "Recall": self.recall(),
            "Coverage": self.coverage()
        }
        print("Metric:  ", metric)
        return metric

# 算法实现

- MostPopular
- GenderMostPopular
- AgeMostPopular
- CountryMostPopular
- DemographicMostPopular

## MostPopular算法

In [11]:
def MostPopular(train, profile, N):
    items = {} 
    for user in train: 
        for item in train[user]:
            items[item] = items.get(item, 0) + 1 
    
    # 按照对商品用过行为的用户数排名
    items = list(sorted(items.items(), key=lambda x: x[1], reverse=True))
    
    # 获取接口函数
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        recs = [x for x in items if x[0] not in seen_items][:N]
        return recs
    
    return GetRecommendation

## GenderMostPopular算法

In [12]:
def GenderMostPopular(train, profile, N):
    # 分别表示男、女
    mitems, fitems = {}, {} 
    for user in train: 
        if profile[user]['gender'] == 'm':
            tmp = mitems
        elif profile[user]['gender'] == 'f':
            tmp = fitems
        
        for item in train[user]:
            tmp[item] = tmp.get(item, 0) + 1 
    mitems = list(sorted(mitems.items(), key=lambda x: x[1], reverse=True))
    fitems = list(sorted(fitems.items(), key=lambda x: x[1], reverse=True))
    
    mostPopular = MostPopular(train, profile, N)
    
    # 定义接口函数
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        if profile[user]['gender'] == 'm':
            recs = [x for x in mitems if x[0] not in seen_items][:N]
        elif profile[user]['gender'] == 'f':
            recs = [x for x in fitems if x[0] not in seen_items][:N]
        ## 没有提供性别信息
        else:
            recs = mostPopular(user)
        return recs
    
    return GetRecommendation

## AgeMostPopular算法

In [58]:
def AgeMostPopular(train, profile, N):
    # 对年龄进行分段
    ages = []
    for user in profile:
        if profile[user]['age'] >= 0: 
            ages.append(profile[user]['age'])
    maxAge, minAge = max(ages), min(ages)
    #print(maxAge, minAge)
    items = [{} for _ in range(int(maxAge//10 + 1))]
    #print("done")
    
    # 分年龄段进行统计
    for user in train:
        if profile[user]['age'] >= 0: 
            age = profile[user]['age'] // 10 
            for item in train[user]:
                items[age][item] = items[age].get(item, 0) + 1 
    #print("done!")
    for i in range(len(items)):
        items[i] = list(sorted(items[i].items(), key=lambda x: x[1], reverse=True))
    
    #print(len(items))
    mostPopular = MostPopular(train, profile, N)
    
    # 获取接口函数
    def GetRecommendation(user):
       # print(1)
        seen_items = set(train[user]) if user in train else set()
        #print(2)
        if profile[user]['age'] >= 0: 
            #print(3, profile[user]['age'])
            age = profile[user]["age"] // 10 
            # 如果年龄信息异常
            if age >= len(items) or len(items[age]) == 0: 
                recs = mostPopular(user)
            else:
                recs = [x for x in items[age] if x[0] not in seen_items][:N]
        else:
            recs = mostPopular(user)
        return recs    
        
    return GetRecommendation

## CountryMostPopular算法

In [15]:
def CountryMostPopular(train, profile, N):
    # 分城市进行
    items = defaultdict(dict)
    for user in train:
        country = profile[user]['country']
        for item in train[user]:
            items[country][item] = items[country].get(item, 0) + 1 
    
    for country in items:
        items[country] = list(sorted(items[country].items(), key=lambda x: x[1], reverse=True))
        
    mostPopular = MostPopular(train, profile, N)
    
    ## 获取接口函数
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        country = profile[user]['country']
        
        if country in items:
            recs = [x for x in items[country] if x[0] not in seen_items][:N]
        else:
            recs = mostPopular(user)
        return recs
    
    return GetRecommendation

## DemographicMostPopular算法

In [67]:
def DemographicMostPopular(train, profile, N):
    # 建立多重字典，将缺失值当做other
    items = {}
    for user in train: 
        gender = profile[user]['gender']
        if gender:
            if gender not in items:
                items[gender] = {}
            
            age = profile[user]['age'] // 10
            if age >= 0:
                if age not in items[gender]:
                    items[gender][age] = {}

                country = profile[user]["country"]
                if country:
                    if country not in items[gender][age]:
                        items[gender][age][country] = {}
                    for item in train[user]:
                        items[gender][age][country][item] = items[gender][age][country].get(item, 0) + 1
    
    for gender in items:
        for age in items[gender]:
            for country in items[gender][age]:
                items[gender][age][country] = list(sorted(items[gender][age][country].items(), 
                                                          key=lambda x: x[1], reverse=True))

    #print(items.keys())
    #print(items['f'].keys())
    
    mostPopular = MostPopular(train, profile, N)
    
    def GetRecommendation(user):
        seen_items = set(train[user]) if user in train else set()
        gender = profile[user]['gender']
        age = profile[user]['age'] // 10 
        country = profile[user]['country']
        if (gender not in items) or (age not in items[gender]) or (country not in items[gender][age]):
            recs = mostPopular(user)
        else:
            recs = [x for x in items[gender][age][country] if x[0] not in seen_items][:N]
        #print(n)
        return recs
    return GetRecommendation

# 实验

- M=10, N=10

In [24]:
class Experiment:
    def __init__(self, M, N, at="MostPopular", 
                fp="data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", 
                up='data/lastfm-dataset-360K/usersha1-profile.tsv'):
        self.M = M 
        self.N = N 
        self.fp = fp 
        self.up = up 
        self.at = at 
        self.alg = {"MostPopular": MostPopular, "GenderMostPopular":GenderMostPopular,
                   "AgeMostPopular": AgeMostPopular, "CountryMostPopular": CountryMostPopular,
                   "DemographicMostPopular": DemographicMostPopular}
        
    # 定义单次实验
    @timmer
    def worker(self, train, test, profile):
        getRecommendation = self.alg[self.at](train, profile, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 定义多次实验
    @timmer
    def run(self):
        metrics = {"Precision": 0, "Recall": 0, "Coverage": 0}
        dataset = Dataset(self.fp, self.up)
        for ii in range(self.M):
            train, test, profile = dataset.splitData(self.M, ii)
            print(f"Experiment {ii}")
            metric = self.worker(train, test, profile)
            metrics = {k: metrics[k]+metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print("Average Result (M={}, N={})".format(self.M, self.N, metrics))

In [23]:
M, N = 10, 10 
most_exp = Experiment(M, N, at="MostPopular")
most_exp.run()

Func loadData, run time: 13.203743
Func splitData, run time: 0.275689
Experiment 0
Metric:   {'Precision': 2.28, 'Recall': 4.62, 'Coverage': 0.07, 'Popularity': 6.475479}
Func worker, run time: 10.254862
Func splitData, run time: 0.252894
Experiment 1
Metric:   {'Precision': 2.29, 'Recall': 4.63, 'Coverage': 0.07, 'Popularity': 6.471694}
Func worker, run time: 10.578079
Func splitData, run time: 0.306388
Experiment 2
Metric:   {'Precision': 2.3, 'Recall': 4.64, 'Coverage': 0.07, 'Popularity': 6.473073}
Func worker, run time: 10.456345
Func splitData, run time: 0.257086
Experiment 3
Metric:   {'Precision': 2.23, 'Recall': 4.49, 'Coverage': 0.07, 'Popularity': 6.471887}
Func worker, run time: 10.478497
Func splitData, run time: 0.490370
Experiment 4
Metric:   {'Precision': 2.36, 'Recall': 4.81, 'Coverage': 0.07, 'Popularity': 6.469927}
Func worker, run time: 10.022021
Func splitData, run time: 0.254269
Experiment 5
Metric:   {'Precision': 2.21, 'Recall': 4.5, 'Coverage': 0.07, 'Popularit

In [25]:
M, N  = 10, 10 
gender_exp = Experiment(M, N, at="GenderMostPopular")
gender_exp.run()

Func loadData, run time: 13.026316
Func splitData, run time: 0.291794
Experiment 0
Metric:   {'Precision': 2.43, 'Recall': 4.92, 'Coverage': 0.1, 'Popularity': 6.448209}
Func worker, run time: 9.096047
Func splitData, run time: 0.259650
Experiment 1
Metric:   {'Precision': 2.34, 'Recall': 4.73, 'Coverage': 0.1, 'Popularity': 6.456241}
Func worker, run time: 8.865354
Func splitData, run time: 0.263320
Experiment 2
Metric:   {'Precision': 2.45, 'Recall': 4.95, 'Coverage': 0.1, 'Popularity': 6.447781}
Func worker, run time: 9.162814
Func splitData, run time: 0.258905
Experiment 3
Metric:   {'Precision': 2.3, 'Recall': 4.64, 'Coverage': 0.1, 'Popularity': 6.45671}
Func worker, run time: 11.631591
Func splitData, run time: 0.527676
Experiment 4
Metric:   {'Precision': 2.41, 'Recall': 4.9, 'Coverage': 0.11, 'Popularity': 6.453743}
Func worker, run time: 9.506752
Func splitData, run time: 0.258272
Experiment 5
Metric:   {'Precision': 2.39, 'Recall': 4.88, 'Coverage': 0.1, 'Popularity': 6.4530

In [60]:
M, N  = 10, 10 
age_exp = Experiment(M, N, at="AgeMostPopular")
age_exp.run()

Func loadData, run time: 12.897695
Func splitData, run time: 0.274037
Experiment 0
Metric:   {'Precision': 2.49, 'Recall': 5.04, 'Coverage': 0.39}
Func worker, run time: 7.010049
Func splitData, run time: 0.268412
Experiment 1
Metric:   {'Precision': 2.44, 'Recall': 4.92, 'Coverage': 0.39}
Func worker, run time: 7.147290
Func splitData, run time: 0.261057
Experiment 2
Metric:   {'Precision': 2.31, 'Recall': 4.66, 'Coverage': 0.37}
Func worker, run time: 7.330279
Func splitData, run time: 1.182984
Experiment 3
Metric:   {'Precision': 2.27, 'Recall': 4.58, 'Coverage': 0.38}
Func worker, run time: 7.382041
Func splitData, run time: 0.256541
Experiment 4
Metric:   {'Precision': 2.32, 'Recall': 4.73, 'Coverage': 0.37}
Func worker, run time: 7.287331
Func splitData, run time: 0.263654
Experiment 5
Metric:   {'Precision': 2.33, 'Recall': 4.75, 'Coverage': 0.39}
Func worker, run time: 7.393450
Func splitData, run time: 0.266589
Experiment 6
Metric:   {'Precision': 2.44, 'Recall': 4.92, 'Covera

In [49]:
M, N  = 10, 10 
country_exp = Experiment(M, N, at="CountryMostPopular")
country_exp.run()

Func loadData, run time: 13.055334
Func splitData, run time: 0.281766
Experiment 0
Metric:   {'Precision': 2.5, 'Recall': 5.04, 'Coverage': 2.27}
Func worker, run time: 1.675560
Func splitData, run time: 0.258156
Experiment 1
Metric:   {'Precision': 2.46, 'Recall': 4.94, 'Coverage': 2.27}
Func worker, run time: 1.704506
Func splitData, run time: 0.259305
Experiment 2
Metric:   {'Precision': 2.44, 'Recall': 4.88, 'Coverage': 2.29}
Func worker, run time: 1.704397
Func splitData, run time: 0.478367
Experiment 3
Metric:   {'Precision': 2.45, 'Recall': 4.88, 'Coverage': 2.3}
Func worker, run time: 1.699900
Func splitData, run time: 0.261493
Experiment 4
Metric:   {'Precision': 2.63, 'Recall': 5.32, 'Coverage': 2.3}
Func worker, run time: 1.728215
Func splitData, run time: 0.262737
Experiment 5
Metric:   {'Precision': 2.52, 'Recall': 5.11, 'Coverage': 2.24}
Func worker, run time: 1.739908
Func splitData, run time: 0.261833
Experiment 6
Metric:   {'Precision': 2.35, 'Recall': 4.69, 'Coverage'

In [68]:
M, N  = 10, 10 
demo_exp = Experiment(M, N, at="DemographicMostPopular")
demo_exp.run()

Func loadData, run time: 12.509141
Func splitData, run time: 0.276548
Experiment 0
Metric:   {'Precision': 2.15, 'Recall': 4.24, 'Coverage': 6.39}
Func worker, run time: 3.183590
Func splitData, run time: 0.255671
Experiment 1
Metric:   {'Precision': 2.22, 'Recall': 4.37, 'Coverage': 6.3}
Func worker, run time: 3.353138
Func splitData, run time: 0.259262
Experiment 2
Metric:   {'Precision': 2.3, 'Recall': 4.52, 'Coverage': 6.37}
Func worker, run time: 3.314888
Func splitData, run time: 0.265179
Experiment 3
Metric:   {'Precision': 2.16, 'Recall': 4.23, 'Coverage': 6.34}
Func worker, run time: 3.358466
Func splitData, run time: 0.251946
Experiment 4
Metric:   {'Precision': 2.17, 'Recall': 4.31, 'Coverage': 6.23}
Func worker, run time: 3.325599
Func splitData, run time: 0.704509
Experiment 5
Metric:   {'Precision': 2.09, 'Recall': 4.15, 'Coverage': 6.4}
Func worker, run time: 3.211060
Func splitData, run time: 0.255199
Experiment 6
Metric:   {'Precision': 2.12, 'Recall': 4.16, 'Coverage'