In [1]:
import random 
import math 
import time 
from tqdm import tqdm 

# 通用函数定义

In [3]:
# 定义装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func: {:s} | Run time: %s".format(func.__name__, stop_time - start_time))
        return res 
    return wrapper

# 数据处理相关

In [4]:
class Data:
    def __init__(self, user, item, rate, test=False, predict=0.0):
        self.user = user
        self.item = item
        self.rate = rate 
        self.test = test 
        self.predict = predict

In [5]:
class Dataset:
    def __init__(self, fp):
        self.data = self.loadData(fp)
    
    def loadData(self, fp):
        data = []
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split("::")[:3])))
        data = [Data(*d) for d in data]
        return data 
    
    def splitData(self, M, k, seed=1):
        random.seed(seed)
        for i in range(len(self.data)):
            if random.randint(0, M-1) == k: 
                self.data[i].test = True

# 评价指标

In [6]:
def RMSE(records):
    rmse = {"train_rmse": [], "test_rmse": []}
    for r in records:
        if r.test:
            rmse["test_rmse"].append((r.rate - r.predict)**2)
        else:
            rmse["train_rmse"].append((r.rate - r.predict)**2)
    rmse = {"train_rmse": math.sqrt(sum(rmse["train_rmse"]) / len(rmse["train_rmse"])),
           "test_rmse": math.sqrt(sum(rmse["test_rmse"]) / len(rmse["test_rmse"]))}
    return rmse

# 算法实现

- Cluster
- IdCluster
- UserActivityCluster
- ItemPopularityCluster
- UserVoteCluster
- ItemVoteCluster

In [7]:
# 1. Cluster
class Cluster:
    def __init__(self, records):
        self.group = {}
    
    def GetGroup(self, i):
        return 0

In [8]:
# 2. IdCluster
class IdCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
    
    def GetGroup(self, i):
        return i

In [9]:
# 3. UserActivityCluster
class UserActivityCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        activity = {}
        for r in records:
            if r.test: continue
            if r.user not in activity:
                activity[r.user] = 0 
            activity[r.user] += 1 
        
        # 按照用户活跃度进行分组
        k = 0 
        for user, n in sorted(activity.items(), key=lambda x: x[-1], reverse=False):
            c = int((k*5)/len(activity))
            k += 1 
    
    def GetGroup(self, uid):
        if uid not in self.group:
            return -1 
        else:
            return self.group[uid]
        

In [10]:
# 4. ItemPopularity
class ItemPopularityCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        popularity = {}
        for r in records:
            if r.test: continue
            if r.item not in popularity:
                popularity[r.item] = 0 
            popularity[r.item] += 1 
        
        # 按照物品流行度进行分组
        k = 0 
        for item, n in sorted(popularity.items(), key=lambda x: x[-1], reverse=False):
            c = int((k*5)/len(popularity))
            self.group[item] = c 
            k += 1 
    
    def GetGroup(self, iid):
        if iid not in self.group:
            return -1 
        else:
            return self.group[iid]

In [16]:
# 5. UserVoteCluster
class UserVoteCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        vote, cnt = {}, {} 
        for r in records:
            if r.test: continue
            vote[r.user] = vote.get(r.user, 0) + r.rate
            cnt[r.user] = cnt.get(r.user, 0) + 1 
        # 按照物品平均评分进行分组
        for user, v in vote.items():
            c = v / (cnt[user] * 1.0 )
            self.group[user] = int(c*2)
        
    def GetGroup(self, uid):
        if uid not in self.group:
            return -1 
        else:
            return self.group[uid]

In [23]:
# 6. ItemVoteCluster
class ItemVoteCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        vote, cnt = {}, {} 
        for r in records:
            if r.test: continue
            vote[r.item] = vote.get(r.item, 0) + r.rate
            cnt[r.item] = cnt.get(r.item, 0) + 1 
        
        # 按照物品平均评分进行分组
        for item, v in vote.items():
            c = v / (cnt[item] * 1.0)
            self.group[item] = int(c*2)
        
    def GetGroup(self, iid):
        if iid not in self.group:
            return -1 
        else:
            return self.group[iid]

In [24]:
# 返回预测接口函数
def PredictAll(records, UserGroup, ItemGroup):
    '''
    records: 数据集 
    UserGroup: 用户分组类 
    ItemGroup: 物品分组类 
    '''
    userGroup = UserGroup(records)
    itemGroup = ItemGroup(records)
    group = {}
    for r in records:
        ug = userGroup.GetGroup(r.user)
        ig = itemGroup.GetGroup(r.item)
        if ug not in group:
            group[ug] = {}
        if ig not in group[ug]:
            group[ug][ig] = []
        group[ug][ig].append(r.rate)
    
    for ug in group:
        for ig in group[ug]:
            group[ug][ig] = sum(group[ug][ig]) / (1.0 * len(group[ug][ig]) + 1.0)
    
    # 预测
    for r in records:
        ug = userGroup.GetGroup(r.user)
        ig = itemGroup.GetGroup(r.item)
        r.predict = group[ug][ig]

# 实验

In [25]:
class Experiment:
    def __init__(self, M, UserGroup, ItemGroup, fp="data/ml-1m/ratings.dat"):
        self.M = M 
        self.userGroup = UserGroup
        self.itemGroup = ItemGroup
        self.fp = fp 
    
    # 定义单次实验
    def worker(self, records):
        PredictAll(records, self.userGroup, self.itemGroup)
        metric = RMSE(records)
        return metric
    
    # 多次实验取平均
    def run(self):
        dataset = Dataset(self.fp)
        dataset.splitData(self.M, 0)
        metric = self.worker(dataset.data)
        print("Result (UserGroup={}, ItemGroup={}): {}".format(self.userGroup.__name__,
                                                              self.itemGroup.__name__,
                                                              metric))

In [26]:
UserGroups = [Cluster, IdCluster, Cluster, UserActivityCluster, UserActivityCluster, Cluster, IdCluster,
             UserActivityCluster, UserVoteCluster, UserVoteCluster, Cluster, IdCluster, UserVoteCluster]
ItemGroups = [Cluster, Cluster, IdCluster, Cluster, IdCluster, ItemPopularityCluster, ItemPopularityCluster,
             ItemPopularityCluster, Cluster, IdCluster, ItemVoteCluster, ItemVoteCluster, ItemVoteCluster]

M = 10 
for i in range(len(UserGroups)):
    exp = Experiment(M, UserGroups[i], ItemGroups[i])
    exp.run()

Result (UserGroup=Cluster, ItemGroup=Cluster): {'train_rmse': 1.1163419044799097, 'test_rmse': 1.1239098429180596}
Result (UserGroup=IdCluster, ItemGroup=Cluster): {'train_rmse': 1.0285654793409407, 'test_rmse': 1.0334592269465828}
Result (UserGroup=Cluster, ItemGroup=IdCluster): {'train_rmse': 0.9748185505735479, 'test_rmse': 0.9798641288094841}
Result (UserGroup=UserActivityCluster, ItemGroup=Cluster): {'train_rmse': 1.1163419044799097, 'test_rmse': 1.1239098429180596}
Result (UserGroup=UserActivityCluster, ItemGroup=IdCluster): {'train_rmse': 0.9748185505735479, 'test_rmse': 0.9798641288094841}
Result (UserGroup=Cluster, ItemGroup=ItemPopularityCluster): {'train_rmse': 1.089464041024945, 'test_rmse': 1.0960645954595978}
Result (UserGroup=IdCluster, ItemGroup=ItemPopularityCluster): {'train_rmse': 0.9992846921965436, 'test_rmse': 1.004305812819349}
Result (UserGroup=UserActivityCluster, ItemGroup=ItemPopularityCluster): {'train_rmse': 1.089464041024945, 'test_rmse': 1.096064595459597