In [1]:
import random 
import math 
import time 
from tqdm import tqdm 

In [10]:
# 定义装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print("Func: {:s} | Run time: {:.2f}".format(func.__name__, stop_time - start_time))
        return res 
    return wrapper

In [3]:
class Data:
    def __init__(self, user, item, rate, test=False, predict=0.0):
        self.user = user
        self.item = item
        self.rate = rate 
        self.test = test 
        self.predict = predict

class Dataset:
    def __init__(self, fp):
        self.data = self.loadData(fp)
    
    def loadData(self, fp):
        data = []
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split("::")[:3])))
        data = [Data(*d) for d in data]
        return data 
    
    def splitData(self, M, k, seed=1):
        random.seed(seed)
        for i in range(len(self.data)):
            if random.randint(0, M-1) == k: 
                self.data[i].test = True

In [4]:
def RMSE(records):
    rmse = {"train_rmse": [], "test_rmse": []}
    for r in records:
        if r.test:
            rmse["test_rmse"].append((r.rate - r.predict)**2)
        else:
            rmse["train_rmse"].append((r.rate - r.predict)**2)
    rmse = {"train_rmse": math.sqrt(sum(rmse["train_rmse"]) / len(rmse["train_rmse"])),
           "test_rmse": math.sqrt(sum(rmse["test_rmse"]) / len(rmse["test_rmse"]))}
    return rmse

In [5]:
# 1. Cluster
class Cluster:
    def __init__(self, records):
        self.group = {}
    
    def GetGroup(self, i):
        return 0


# 2. IdCluster
class IdCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
    
    def GetGroup(self, i):
        return i

    
# 3. UserActivityCluster
class UserActivityCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        activity = {}
        for r in records:
            if r.test: continue
            if r.user not in activity:
                activity[r.user] = 0 
            activity[r.user] += 1 
        
        # 按照用户活跃度进行分组
        k = 0 
        for user, n in sorted(activity.items(), key=lambda x: x[-1], reverse=False):
            c = int((k*5)/len(activity))
            k += 1 
    
    def GetGroup(self, uid):
        if uid not in self.group:
            return -1 
        else:
            return self.group[uid]

# 4. ItemPopularity
class ItemPopularityCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        popularity = {}
        for r in records:
            if r.test: continue
            if r.item not in popularity:
                popularity[r.item] = 0 
            popularity[r.item] += 1 
        
        # 按照物品流行度进行分组
        k = 0 
        for item, n in sorted(popularity.items(), key=lambda x: x[-1], reverse=False):
            c = int((k*5)/len(popularity))
            self.group[item] = c 
            k += 1 
    
    def GetGroup(self, iid):
        if iid not in self.group:
            return -1 
        else:
            return self.group[iid]

# 5. UserVoteCluster
class UserVoteCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        vote, cnt = {}, {} 
        for r in records:
            if r.test: continue
            vote[r.user] = vote.get(r.user, 0) + r.rate
            cnt[r.user] = cnt.get(r.user, 0) + 1 
        # 按照物品平均评分进行分组
        for user, v in vote.items():
            c = v / (cnt[user] * 1.0 )
            self.group[user] = int(c*2)
        
    def GetGroup(self, uid):
        if uid not in self.group:
            return -1 
        else:
            return self.group[uid]

# 6. ItemVoteCluster
class ItemVoteCluster(Cluster):
    def __init__(self, records):
        Cluster.__init__(self, records)
        vote, cnt = {}, {} 
        for r in records:
            if r.test: continue
            vote[r.item] = vote.get(r.item, 0) + r.rate
            cnt[r.item] = cnt.get(r.item, 0) + 1 
        
        # 按照物品平均评分进行分组
        for item, v in vote.items():
            c = v / (cnt[item] * 1.0)
            self.group[item] = int(c*2)
        
    def GetGroup(self, iid):
        if iid not in self.group:
            return -1 
        else:
            return self.group[iid]

In [14]:
# 预测接口函数进行修改
def PredictAll(records, UserGroup, ItemGroup):
    userGroup = UserGroup(records)
    itemGroup = ItemGroup(records)
    
    group = {}
    for r in records:
        ug = userGroup.GetGroup(r.user)
        ig = itemGroup.GetGroup(r.item)
        if ug not in group:
            group[ug] = {}
        if ig not in group[ug]:
            group[ug][ig] = []
        # 这里计算残差
        group[ug][ig].append(r.rate - r.predict)
    
    for ug in group:
        for ig in group[ug]:
            group[ug][ig] = sum(group[ug][ig]) / (1.0 * len(group[ug][ig]) + 1.0)
    
    # 预测
    for i in range(len(records)):
        ug = userGroup.GetGroup(records[i].user)
        ig = itemGroup.GetGroup(records[i].item)
        # 这里将之前的结果进行结合
        records[i].predict += group[ug][ig]

In [19]:
class Experiment:
    def __init__(self, M, UserGroup, ItemGroup, fp="data/ml-1m/ratings.dat"):
        self.M = M 
        self.userGroup = UserGroup
        self.itemGroup = ItemGroup
        self.dataset = Dataset(fp)
        self.dataset.splitData(M, 0) 
    
    # 定义单次实验
    def worker(self, records):
        PredictAll(records, self.userGroup, self.itemGroup)
        metric = RMSE(records)
        return metric
    
    # 多次实验取平均
    @timmer
    def run(self):
        metric = self.worker(self.dataset.data)
        print("Result (UserGroup={}, ItemGroup={}): {}".format(self.userGroup.__name__,
                                                              self.itemGroup.__name__,
                                                              metric))

In [20]:
UserGroups = [Cluster, IdCluster, Cluster, UserActivityCluster, UserActivityCluster, Cluster, IdCluster,
             UserActivityCluster, UserVoteCluster, UserVoteCluster, Cluster, IdCluster, UserVoteCluster]
ItemGroups = [Cluster, Cluster, IdCluster, Cluster, IdCluster, ItemPopularityCluster, ItemPopularityCluster,
             ItemPopularityCluster, Cluster, IdCluster, ItemVoteCluster, ItemVoteCluster, ItemVoteCluster]

M = 10 
exp = Experiment(M, None, None)
for i in range(len(UserGroups)):
    exp.userGroup = UserGroups[i]
    exp.itemGroup = ItemGroups[i]
    exp.run()

Result (UserGroup=Cluster, ItemGroup=Cluster): {'train_rmse': 1.1163419044799097, 'test_rmse': 1.1239098429180596}
Func: run | Run time: 0.80
Result (UserGroup=IdCluster, ItemGroup=Cluster): {'train_rmse': 1.0279720784446358, 'test_rmse': 1.0328163104921269}
Func: run | Run time: 1.15
Result (UserGroup=Cluster, ItemGroup=IdCluster): {'train_rmse': 0.9187033847868101, 'test_rmse': 0.921579422992697}
Func: run | Run time: 0.97
Result (UserGroup=UserActivityCluster, ItemGroup=Cluster): {'train_rmse': 0.9187031894922616, 'test_rmse': 0.921576615516696}
Func: run | Run time: 1.04
Result (UserGroup=UserActivityCluster, ItemGroup=IdCluster): {'train_rmse': 0.9186623607307105, 'test_rmse': 0.9215180703205945}
Func: run | Run time: 1.17
Result (UserGroup=Cluster, ItemGroup=ItemPopularityCluster): {'train_rmse': 0.9186620874608045, 'test_rmse': 0.9215186108542568}
Func: run | Run time: 1.24
Result (UserGroup=IdCluster, ItemGroup=ItemPopularityCluster): {'train_rmse': 0.8856866647867953, 'test_rm