In [4]:
# -*- coding: utf-8 -*-

'''
实现隐语义模型，对隐式数据进行推荐
1.对正样本生成负样本
  -负样本数量相当于正样本
  -物品越热门，越有可能成为负样本
2.使用随机梯度下降法，更新参数
'''

import pandas as pd
import time
import random

class LFM:
    """Negative-sample generator for implicit-feedback book data.

    Despite the name (latent factor model), the code in this class only
    builds negative samples for each user; no model training happens here.
    """

    def __init__(self, data, ratio, F=5, N=10, alpha=0.02, lamda=0.01, topk=10):
        """Store the sample set and sampling settings.

        Args:
            data: DataFrame of positive samples; the methods below read its
                'user_id' and 'book_id' columns.
            ratio: number of negative samples requested per positive sample.
            F, N, alpha, lamda: accepted for interface compatibility but not
                used anywhere in this class.
            topk: stored on the instance; not used by the methods below.
        """
        self.data = data  # positive sample set
        self.ratio = ratio  # negatives requested per positive
        self.topk = topk  # size of the recommendation list

    def InitItemPool(self):
        """Return {book_id: occurrence count}, most frequent book first.

        Books with equal counts keep ascending book_id order, because
        groupby sorts its keys and the descending sort below is stable.
        """
        counts = self.data.groupby('book_id').size()
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        return {book: int(count) for book, count in ranked}

    def user_item(self):
        """Return {user_id: set of book_ids that appear with that user}."""
        ui = dict()
        for user, group in self.data.groupby('user_id'):
            # Plain column selection replaces the removed DataFrame.ix indexer.
            ui[user] = set(group['book_id'])
        return ui

    def RandSelectNegativeSamples(self, items):
        """Pick negative samples for one user.

        Walks self.itemPool in its stored order (most popular first when it
        was built by InitItemPool) and takes the first books that are not in
        ``items``. Which books get picked is therefore deterministic; only
        the score attached to each one is random.

        Args:
            items: collection of book_ids the user already has.

        Returns:
            dict {book_id: score} with each score drawn uniformly from
            [-0.5, 0]. It holds at most round(len(items) * ratio) entries,
            and fewer when the pool runs out of unseen books.
        """
        ret = dict()
        negtiveNum = int(round(len(items) * self.ratio))
        for item in self.itemPool:
            if len(ret) >= negtiveNum:
                break
            if item in items:
                # Already a positive sample for this user; try the next book.
                continue
            ret[item] = random.uniform(-0.5, 0)
        return ret

    def Train(self):
        """Build the negative samples for every user.

        Sets self.itemPool and self.ui as a side effect and prints the total
        number of positive samples visited and the number of users.

        Returns:
            list of dicts with keys 'user_id', 'book_id' and 'final_score',
            one per generated negative sample.
        """
        self.itemPool = self.InitItemPool()  # books ranked by popularity
        self.ui = self.user_item()  # user -> set of books
        all_user_neg = []
        num = 0
        for user, items in self.ui.items():
            num += len(items)
            user_neg = self.RandSelectNegativeSamples(items)
            all_user_neg.extend(
                {'user_id': user, 'book_id': book_id, 'final_score': score}
                for book_id, score in user_neg.items()
            )
        print('在寻找负样本时用到的正样本的数目为{}'.format(num))
        print('用户数目为：{}'.format(len(self.ui)))
        return all_user_neg

if __name__ == "__main__":
    # Script entry: read positive samples, generate negatives, write both
    # to one CSV.
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # Load the positive samples
    train_data = pd.read_csv(
        'user_book_score_time\\user_book_score_time_19_4VS5.csv',
        usecols=['user_id', 'book_id', 'final_score'])
    print('正样本的数目为{}'.format(train_data.shape[0]))
    lfm = LFM(train_data, ratio=1)
    all_user_neg = lfm.Train()
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
    data_neg = pd.DataFrame(all_user_neg)
    print('负样本前5行如下：')
    print(data_neg.head())
    print('负样本长度如下：')
    print(len(all_user_neg))
    # sort=False keeps train_data's column order and avoids the pandas
    # FutureWarning about sorting a non-aligned concatenation axis.
    final_train_data = pd.concat([train_data, data_neg], sort=False)
    # The backslash is escaped explicitly; the original '\p' was an invalid
    # escape sequence that produced the same string value.
    final_train_data.to_csv(
        'positive_negtive_data\\positive_negtive_data_19_4VS5.csv',
        index=False)


正样本的数目为225
在寻找负样本时用到的正样本的数目为225
用户数目为：142
finish all in 0.06831015264131679
负样本前5行如下：
   book_id  final_score  user_id
0        0    -0.013430        0
1        1    -0.187596        0
2        2    -0.449421        0
3        0    -0.392654        1
4        1    -0.078877        1
负样本长度如下：
225


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
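Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.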


