In [1]:
import pandas as pd
import numpy as np
from d2l import torch as d2l

def data_split(data_path, x=0.8, random=False):
    print("读取文件中...")
    timer = d2l.Timer()
    timer.start()
    f = open(data_path, 'r')
    temp = []
    while True:
        line = f.readline()
        if line == '':
            break
        user, num = line.split('|')
        for i in range(int(num)):
            line = f.readline()
            item, rating = line.split('  ')
            temp.append([int(user), int(item), int(rating)])

    ratings = pd.DataFrame(temp)
    ratings.rename(columns={0: 'userId', 1: 'movieId', 2: 'rating'}, inplace=True)
    print('数据大小:{}'.format(ratings.shape))
    print("文件读取完成，正在切分数据集...")
    validation_index = []
    for uid in ratings.groupby("userId").any().index:
        user_rating_data = ratings.where(ratings["userId"] == uid).dropna()
        if random:
            index = list(user_rating_data.index)
            np.random.shuffle(index)
            _index = round(len(user_rating_data) * x)
            validation_index += list(index[_index:])
        else:
            index = round(len(user_rating_data) * x)
            validation_index += list(user_rating_data.index.values[index:])

    validation_set = ratings.loc[validation_index]
    train_set = ratings.drop(validation_index)
    timer.stop()
    print('数据集切分完成，耗时 :{} sec'.format(timer.sum()))
    return train_set, validation_set


train_path = 'data-202205/train.txt'
test_path = 'data-202205/test.txt'
answer_path = 'answer/out.txt'
train, validation = data_split(train_path, random=False)

读取文件中...
数据大小:(5001507, 3)
文件读取完成，正在切分数据集...
数据集切分完成，耗时 :6185.30339550972 sec


In [None]:
def evaluate_accuracy(predict_results):
    metric = d2l.Accumulator(3)
    for uid, iid, real_rating, pred_rating in predict_results:
        metric.add(1, (pred_rating - real_rating) ** 2, abs(pred_rating - real_rating))
    return round(np.sqrt(metric[1] / metric[0]), 4), round(metric[2] / metric[0], 4)


def predict_test(file_path, write_path, cf):
    f = open(file_path, 'r')
    b = open(write_path, 'w')
    while True:
        line = f.readline()
        if line == '':
            break
        b.write(line)
        user, num = line.split('|')
        for i in range(int(num)):
            line = f.readline().split('\n')[0]
            rating = cf.predict(int(user), int(line))
            b.write(line + '  ' + str(rating) + '\n')

In [2]:
class BiasSvd:

    def __init__(self, dataset, epochs, alpha, hidden, parameter_p, parameter_q, parameter_bu, parameter_bi, columns):
        self.dataset = dataset
        self.epochs = epochs
        self.alpha = alpha
        self.hidden = hidden
        self.parameter_p = parameter_p
        self.parameter_q = parameter_q
        self.parameter_bu = parameter_bu
        self.parameter_bi = parameter_bi
        self.columns = columns
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        self.global_mean = self.dataset[self.columns[2]].mean()
        self.bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        self.bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))
        self.P = dict(zip(
            self.users_ratings.index,
            np.random.rand(len(self.users_ratings), self.hidden).astype(np.float32)
        ))
        self.Q = dict(zip(
            self.items_ratings.index,
            np.random.rand(len(self.items_ratings), self.hidden).astype(np.float32)
        ))

    def train_bs(self, validation_set):
        animator = d2l.Animator(xlabel='epoch', xlim=[1, self.epochs], ylim=[0, 50],
                                legend=['train RMSE', 'val'])
        timer = d2l.Timer()
        for epoch in range(self.epochs):
            print('epoch :{}'.format(epoch))
            metric = d2l.Accumulator(2)
            timer.start()
            for i, (uid, iid, real_rating) in enumerate(self.dataset.itertuples(index=False)):
                vec_pu = self.P[uid]
                vec_qi = self.Q[iid]
                error = np.float32(
                    real_rating - (self.global_mean + self.bu[uid] + self.bi[iid] + np.dot(vec_pu, vec_qi)))
                vec_pu += self.alpha * (error * vec_qi - self.parameter_p * vec_pu)
                vec_qi += self.alpha * (error * vec_pu - self.parameter_q * vec_qi)
                self.P[uid] = vec_pu
                self.Q[iid] = vec_qi
                self.bu[uid] += self.alpha * (error - self.parameter_bu * self.bu[uid])
                self.bi[iid] += self.alpha * (error - self.parameter_bi * self.bi[iid])
                metric.add(1, error ** 2)
            timer.stop()
            pred_results = self.validate(validation_set)
            rmse, mae = evaluate_accuracy(pred_results)
            #print(rmse, mae)

            animator.add(epoch + 1, (round(np.sqrt(metric[1] / metric[0]), 4), rmse))
        print('training time :{}'.format(timer.sum()))
        # d2l.plt.show()

    def predict(self, uid, iid):
        if uid not in self.users_ratings.index or iid not in self.items_ratings.index:
            return self.global_mean
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid] + np.dot(self.P[uid], self.Q[iid])
        if predict_rating > 100:
            predict_rating = 100
        if predict_rating < 0:
            predict_rating = 0
        return predict_rating

    def validate(self, validation_set):
        for uid, iid, real_rating in validation_set.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
            else:
                yield uid, iid, real_rating, pred_rating

In [1]:
class BaselineCF:

    def __init__(self, dataset, epochs, parameter_bu, parameter_bi, columns):
        self.dataset = dataset
        self.epochs = epochs
        self.parameter_bu = parameter_bu
        self.parameter_bi = parameter_bi
        self.columns = columns
        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        self.global_mean = self.dataset[self.columns[2]].mean()
        self.bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        self.bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))

    def train_bl(self, validation_set):
        animator = d2l.Animator(xlabel='epoch', xlim=[1, self.epochs], ylim=[0, 50],
                                legend=['train RMSE', 'validation RMSE'])
        #timer = d2l.Timer()
        # for epoch in range(self.epochs):
        #     print('epoch :{}'.format(epoch))
        #     metric = d2l.Accumulator(2)
        #     timer.start()
        #     for i, (uid, iid, real_rating) in enumerate(self.dataset.itertuples(index=False)):
        #         error = real_rating - (self.global_mean + self.bu[uid] + self.bi[iid])
        #
        #         self.bu[uid] += self.alpha * (error - self.parameter * self.bu[uid])
        #         self.bi[iid] += self.alpha * (error - self.parameter * self.bi[iid])
        #         metric.add(1, error ** 2)
        #     timer.stop()
        #     pred_results = self.validate(validation_set)
        #     rmse, mae = evaluate_accuracy(pred_results)
        #     animator.add(epoch + 1, (round(np.sqrt(metric[1] / metric[0]), 4), rmse))

        for i in range(self.epochs):
            for iid, uids, ratings in self.items_ratings.itertuples(index=True):
                _sum = 0
                for uid, rating in zip(uids, ratings):
                    _sum += rating - self.global_mean - self.bu[uid]
                self.bi[iid] = _sum / (self.parameter_bi + len(uids))

            for uid, iids, ratings in self.users_ratings.itertuples(index=True):
                _sum = 0
                for iid, rating in zip(iids, ratings):
                    _sum += rating - self.global_mean - self.bi[iid]
                self.bu[uid] = _sum / (self.parameter_bu + len(iids))

            pred_results = self.validate(validation_set)
            rmse, mae = evaluate_accuracy(pred_results)
            print(rmse, mae)
            animator.add(i + 1, (mae, rmse))
        d2l.plt.show()

    def predict(self, uid, iid):
        if iid not in self.items_ratings.index:
            return 0
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        if predict_rating > 100:
            predict_rating = 100
        if predict_rating < 0:
            predict_rating = 0
        return predict_rating

    def validate(self, validation_set):
        for uid, iid, real_rating in validation_set.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except Exception as e:
                print(e)
            else:
                yield uid, iid, real_rating, pred_rating

In [2]:
    bs = BiasSvd(train, 15, 0.0005, 80, 0.1, 0.1, 0.1, 0.1, ['userId', 'movieId', 'rating'])
    bs.train_bs(validation)

    bl = BaselineCF(train, 10, 0.3, 0, ['userId', 'movieId', 'rating'])
    bl.train_bl(validation)

    predict_test(test_path, answer_path, bl, bs)

NameError: name 'BiasSvd' is not defined